import os
import unittest
from support import html5lib_test_files, TestData, test_dir
from html5lib import HTMLParser, inputstream
import re, unittest
class Html5EncodingTestCase(unittest.TestCase):
def test_codec_name(self):
self.assertEquals(inputstream.codecName("utf-8"), "utf-8")
self.assertEquals(inputstream.codecName("utf8"), "utf-8")
self.assertEquals(inputstream.codecName(" utf8 "), "utf-8")
self.assertEquals(inputstream.codecName("ISO_8859--1"), "windows-1252")
def buildTestSuite():
for filename in html5lib_test_files("encoding"):
test_name = os.path.basename(filename).replace('.dat',''). \
replace('-','')
tests = TestData(filename, "data")
for idx, test in enumerate(tests):
def encodingTest(self, data=test['data'],
encoding=test['encoding']):
p = HTMLParser()
t = p.parse(data, useChardet=False)
errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
(data, repr(encoding.lower()),
repr(p.tokenizer.stream.charEncoding)))
self.assertEquals(encoding.lower(),
p.tokenizer.stream.charEncoding[0],
errorMessage)
setattr(Html5EncodingTestCase, 'test_%s_%d' % (test_name, idx+1),
encodingTest)
try:
import chardet
def test_chardet(self):
data = open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt")).read()
encoding = inputstream.HTMLInputStream(data).charEncoding
assert encoding[0].lower() == "big5"
setattr(Html5EncodingTestCase, 'test_chardet', test_chardet)
except ImportError:
print "chardet not found, skipping chardet tests"
return unittest.defaultTestLoader.loadTestsFromName(__name__)
def main():
buildTestSuite()
unittest.main()
if __name__ == "__main__":
main()