import os import sys import StringIO import unittest import warnings warnings.simplefilter("error") from support import html5lib_test_files, TestData, convertExpected from html5lib import html5parser, treewalkers, treebuilders, constants from html5lib.filters.lint import Filter as LintFilter, LintError def PullDOMAdapter(node): from xml.dom import Node from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, COMMENT, CHARACTERS if node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): for childNode in node.childNodes: for event in PullDOMAdapter(childNode): yield event elif node.nodeType == Node.DOCUMENT_TYPE_NODE: raise NotImplementedError("DOCTYPE nodes are not supported by PullDOM") elif node.nodeType == Node.COMMENT_NODE: yield COMMENT, node elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): yield CHARACTERS, node elif node.nodeType == Node.ELEMENT_NODE: yield START_ELEMENT, node for childNode in node.childNodes: for event in PullDOMAdapter(childNode): yield event yield END_ELEMENT, node else: raise NotImplementedError("Node type not supported: " + str(node.nodeType)) treeTypes = { "simpletree": {"builder": treebuilders.getTreeBuilder("simpletree"), "walker": treewalkers.getTreeWalker("simpletree")}, "DOM": {"builder": treebuilders.getTreeBuilder("dom"), "walker": treewalkers.getTreeWalker("dom")}, "PullDOM": {"builder": treebuilders.getTreeBuilder("dom"), "adapter": PullDOMAdapter, "walker": treewalkers.getTreeWalker("pulldom")}, } #Try whatever etree implementations are available from a list that are #"supposed" to work try: import xml.etree.ElementTree as ElementTree treeTypes['ElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} except ImportError: try: import elementtree.ElementTree as ElementTree treeTypes['ElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} except ImportError: pass try: import xml.etree.cElementTree as ElementTree treeTypes['cElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} except ImportError: try: import cElementTree as ElementTree treeTypes['cElementTree'] = \ {"builder": treebuilders.getTreeBuilder("etree", ElementTree), "walker": treewalkers.getTreeWalker("etree", ElementTree)} except ImportError: pass try: import lxml.etree as ElementTree # treeTypes['lxml_as_etree'] = \ # {"builder": treebuilders.getTreeBuilder("etree", ElementTree), # "walker": treewalkers.getTreeWalker("etree", ElementTree)} treeTypes['lxml_native'] = \ {"builder": treebuilders.getTreeBuilder("lxml"), "walker": treewalkers.getTreeWalker("lxml")} except ImportError: pass try: import BeautifulSoup treeTypes["beautifulsoup"] = \ {"builder": treebuilders.getTreeBuilder("beautifulsoup"), "walker": treewalkers.getTreeWalker("beautifulsoup")} except ImportError: pass #Try whatever etree implementations are available from a list that are #"supposed" to work try: import pxdom treeTypes['pxdom'] = \ {"builder": treebuilders.getTreeBuilder("dom", pxdom), "walker": treewalkers.getTreeWalker("dom")} except ImportError: pass try: from genshi.core import QName, Attrs from genshi.core import START, END, TEXT, COMMENT, DOCTYPE def GenshiAdapter(tree): text = None for token in treewalkers.getTreeWalker("simpletree")(tree): type = token["type"] if type in ("Characters", "SpaceCharacters"): if text is None: text = token["data"] else: text += token["data"] elif text is not None: yield TEXT, text, (None, -1, -1) text = None if type in ("StartTag", "EmptyTag"): if token["namespace"]: name = u"{%s}%s" % (token["namespace"], token["name"]) else: name = token["name"] yield (START, (QName(name), Attrs([(QName(attr),value) for attr,value in token["data"]])), (None, -1, -1)) if type == "EmptyTag": type = "EndTag" if type == "EndTag": yield END, QName(token["name"]), (None, -1, -1) elif type == "Comment": yield COMMENT, token["data"], (None, -1, -1) elif type == "Doctype": yield DOCTYPE, (token["name"], token["publicId"], token["systemId"]), (None, -1, -1) else: pass # FIXME: What to do? if text is not None: yield TEXT, text, (None, -1, -1) #treeTypes["genshi"] = \ # {"builder": treebuilders.getTreeBuilder("simpletree"), # "adapter": GenshiAdapter, # "walker": treewalkers.getTreeWalker("genshi")} except ImportError: pass def concatenateCharacterTokens(tokens): charactersToken = None for token in tokens: type = token["type"] if type in ("Characters", "SpaceCharacters"): if charactersToken is None: charactersToken = {"type": "Characters", "data": token["data"]} else: charactersToken["data"] += token["data"] else: if charactersToken is not None: yield charactersToken charactersToken = None yield token if charactersToken is not None: yield charactersToken def convertTokens(tokens): output = [] indent = 0 for token in concatenateCharacterTokens(tokens): type = token["type"] if type in ("StartTag", "EmptyTag"): if (token["namespace"] and token["namespace"] != constants.namespaces["html"]): if token["namespace"] in constants.prefixes: name = constants.prefixes[token["namespace"]] else: name = token["namespace"] name += u" " + token["name"] else: name = token["name"] output.append(u"%s<%s>" % (" "*indent, name)) indent += 2 attrs = token["data"] if attrs: #TODO: Remove this if statement, attrs should always exist for (namespace,name),value in sorted(attrs.items()): if namespace: if namespace in constants.prefixes: outputname = constants.prefixes[namespace] else: outputname = namespace outputname += u" " + name else: outputname = name output.append(u"%s%s=\"%s\"" % (" "*indent, outputname, value)) if type == "EmptyTag": indent -= 2 elif type == "EndTag": indent -= 2 elif type == "Comment": output.append("%s" % (" "*indent, token["data"])) elif type == "Doctype": if token["name"]: if token["publicId"]: output.append("""%s"""% (" "*indent, token["name"], token["publicId"], token["systemId"] and token["systemId"] or "")) elif token["systemId"]: output.append("""%s"""% (" "*indent, token["name"], token["systemId"])) else: output.append("%s"%(" "*indent, token["name"])) else: output.append("%s" % (" "*indent,)) elif type in ("Characters", "SpaceCharacters"): output.append("%s\"%s\"" % (" "*indent, token["data"])) else: pass # TODO: what to do with errors? return u"\n".join(output) import re attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+",re.M) def sortattrs(x): lines = x.group(0).split("\n") lines.sort() return "\n".join(lines) class TokenTestCase(unittest.TestCase): def test_all_tokens(self): expected = [ {'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'}, {'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'}, {'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'head'}, {'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'}, {'data': u'a', 'type': 'Characters'}, {'data': {}, 'type': 'StartTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'}, {'data': u'b', 'type': 'Characters'}, {'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'div'}, {'data': u'c', 'type': 'Characters'}, {'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'body'}, {'data': {}, 'type': 'EndTag', 'namespace': u'http://www.w3.org/1999/xhtml', 'name': u'html'} ] for treeName, treeCls in treeTypes.iteritems(): p = html5parser.HTMLParser(tree = treeCls["builder"]) document = p.parse("a
b
c") document = treeCls.get("adapter", lambda x: x)(document) output = treeCls["walker"](document) for expectedToken, outputToken in zip(expected, output): self.assertEquals(expectedToken, outputToken) def run_test(innerHTML, input, expected, errors, treeClass): try: p = html5parser.HTMLParser(tree = treeClass["builder"]) if innerHTML: document = p.parseFragment(StringIO.StringIO(input), innerHTML) else: document = p.parse(StringIO.StringIO(input)) except constants.DataLossWarning: #Ignore testcases we know we don't pass return document = treeClass.get("adapter", lambda x: x)(document) try: output = convertTokens(treeClass["walker"](document)) output = attrlist.sub(sortattrs, output) expected = attrlist.sub(sortattrs, convertExpected(expected)) assert expected == output, "\n".join([ "", "Input:", input, "", "Expected:", expected, "", "Received:", output ]) except NotImplementedError: pass # Amnesty for those that confess... def test_treewalker(): sys.stdout.write('Testing tree walkers '+ " ".join(treeTypes.keys()) + "\n") for treeName, treeCls in treeTypes.iteritems(): files = html5lib_test_files('tree-construction') for filename in files: testName = os.path.basename(filename).replace(".dat","") tests = TestData(filename, "data") for index, test in enumerate(tests): (input, errors, innerHTML, expected) = [test[key] for key in ("data", "errors", "document-fragment", "document")] errors = errors.split("\n") yield run_test, innerHTML, input, expected, errors, treeCls