"""Tree-construction tests for the html5lib parser.

Test cases are read from the ``tree-construction`` data files, parsed with the
tree builders registered in ``treeTypes`` and the serialized result is
compared with the expected tree dump.
"""
import os
import sys
import traceback
import StringIO
import warnings
import re

# Turn warnings into exceptions so that DataLossWarning can be caught below.
warnings.simplefilter("error")

from support import html5lib_test_files as data_files
from support import TestData, convert, convertExpected

import html5lib
from html5lib import html5parser, treebuilders, constants

# Maps a display name to the tree builder class used for parsing.
treeTypes = {"simpletree": treebuilders.getTreeBuilder("simpletree"),
             "DOM": treebuilders.getTreeBuilder("dom")}

# Try whatever etree implementations are available from a list that are
# "supposed" to work
try:
    import xml.etree.ElementTree as ElementTree
    treeTypes['ElementTree'] = treebuilders.getTreeBuilder(
        "etree", ElementTree, fullTree=True)
except ImportError:
    try:
        import elementtree.ElementTree as ElementTree
        treeTypes['ElementTree'] = treebuilders.getTreeBuilder(
            "etree", ElementTree, fullTree=True)
    except ImportError:
        pass

try:
    import xml.etree.cElementTree as cElementTree
    treeTypes['cElementTree'] = treebuilders.getTreeBuilder(
        "etree", cElementTree, fullTree=True)
except ImportError:
    try:
        import cElementTree
        treeTypes['cElementTree'] = treebuilders.getTreeBuilder(
            "etree", cElementTree, fullTree=True)
    except ImportError:
        pass

try:
    try:
        import lxml.html as lxml
    except ImportError:
        import lxml.etree as lxml
    treeTypes['lxml'] = treebuilders.getTreeBuilder(
        "lxml", lxml, fullTree=True)
except ImportError:
    pass

try:
    import BeautifulSoup
    treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder(
        "beautifulsoup", fullTree=True)
except ImportError:
    pass

# Try whatever dom implementations are available from a list that are
# "supposed" to work
try:
    import pxdom
    treeTypes["pxdom"] = treebuilders.getTreeBuilder("dom", pxdom)
except ImportError:
    pass

# Run the parse error checks
checkParseErrors = False


# XXX - There should just be one function here but for some reason the
# testcase format differs from the treedump format by a single space character
def convertTreeDump(data):
    """Return the tree dump *data* converted with ``convert(3)``, minus its
    first line."""
    return "\n".join(convert(3)(data).split("\n")[1:])


# Substitution helper: rewrites lines of the form "<indent><name>" in an
# expected tree dump. Group 1 is the leading whitespace, group 2 the name.
namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub


def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    """Parse *input* and assert that the resulting tree matches *expected*.

    innerHTML -- when truthy, *input* is parsed as a fragment and this value
                 is passed as the second argument to ``parseFragment``.
    input -- the markup to parse.
    expected -- the expected tree dump, in test-case format.
    errors -- the expected parse errors; only their count is compared, and
              only when the module-level ``checkParseErrors`` flag is set.
    treeClass -- tree builder class handed to ``HTMLParser``.
    namespaceHTMLElements -- forwarded to ``HTMLParser``; when true the
                 expected dump is rewritten to carry the ``html`` prefix.

    The test is silently skipped (returns None) when constructing the parser
    or parsing a full document raises ``constants.DataLossWarning``.
    Raises AssertionError on any other parse failure or on a tree mismatch.
    """
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(
            tree=treeClass, namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return

    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return
    except Exception:
        # Report the failure together with the input that triggered it.
        # Only Exception is caught so that KeyboardInterrupt/SystemExit
        # still abort the run instead of being reported as test failures.
        errorMsg = u"\n".join([u"\n\nInput:", input,
                               u"\nExpected:", expected,
                               u"\nTraceback:", traceback.format_exc()])
        assert False, errorMsg.encode("utf8")

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        # Keep the captured element name (group 2) and add the "html" prefix;
        # replacing with the indentation alone would erase every element line
        # from the expected dump.
        expected = namespaceExpected(r"\1<html \2>", expected)

    errorMsg = u"\n".join([u"\n\nInput:", input,
                           u"\nExpected:", expected,
                           u"\nReceived:", output])
    assert expected == output, errorMsg.encode("utf8")

    # NOTE(review): the conditional expression binds looser than "%", so a
    # non-dict datavars is rendered as the tuple itself rather than being
    # interpolated into the message template. The parentheses below only make
    # that existing behaviour explicit -- confirm whether
    # ``E[errorcode] % (datavars,)`` was intended.
    errStr = [u"Line: %i Col: %i %s" % (
                  line, col,
                  (constants.E[errorcode] % datavars)
                  if isinstance(datavars, dict) else (datavars,))
              for ((line, col), errorcode, datavars) in p.errors]

    errorMsg2 = u"\n".join([
        u"\n\nInput:", input,
        u"\nExpected errors (" + str(len(errors)) + u"):\n" +
        u"\n".join(errors),
        u"\nActual errors (" + str(len(p.errors)) + u"):\n" +
        u"\n".join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")


def test_parser():
    """Yield ``(runParserTest, ...)`` tuples for the tree-construction tests.

    Each yielded tuple is the test callable followed by its positional
    arguments: innerHTML, input, expected, errors, tree builder class and the
    namespaceHTMLElements flag.
    """
    sys.stderr.write('Testing tree builders ' +
                     " ".join(treeTypes.keys()) + "\n")
    files = data_files('tree-construction')

    for filename in files:
        tests = TestData(filename, "data")

        for test in tests:
            input, errors, innerHTML, expected = [
                test[key] for key in ('data', 'errors',
                                      'document-fragment', 'document')]
            if errors:
                errors = errors.split("\n")

            for treeCls in treeTypes.values():
                for namespaceHTMLElements in (True, False):
                    # Single parenthesized argument: prints the same text
                    # whether ``print`` is a statement or a function.
                    print(input)
                    yield (runParserTest, innerHTML, input, expected,
                           errors, treeCls, namespaceHTMLElements)
                    # NOTE(review): the nesting level of this ``break`` could
                    # not be recovered from the source; placed here it skips
                    # the namespaceHTMLElements=False run. Looks like a
                    # debugging leftover (as does the print) -- confirm.
                    break