Further improved memory handling of bs4 for torrent providers.

2024-12-13 11:32:20 -05:00 · 2014-07-21 16:01:46 -07:00 · 2014-07-21 16:01:46 -07:00 · 403c267953
commit 403c267953
parent 4a29476415
136 changed files with 5348 additions and 80311 deletions
--- a/lib/html5lib/init.py
+++ b/lib/html5lib/init.py
@ -10,8 +10,14 @@ import html5lib
 f = open("my_document.html")
 tree = html5lib.parse(f)
 """
-__version__ = "0.95-dev"
+
-from html5parser import HTMLParser, parse, parseFragment
+from __future__ import absolute_import, division, unicode_literals
-from treebuilders import getTreeBuilder
+
-from treewalkers import getTreeWalker
+from .html5parser import HTMLParser, parse, parseFragment
-from serializer import serialize
+from .treebuilders import getTreeBuilder
 from .treewalkers import getTreeWalker
 from .serializer import serialize
 __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
           "getTreeWalker", "serialize"]
 __version__ = "0.999"
--- a/lib/html5lib/constants.py
+++ b/lib/html5lib/constants.py
--- a/lib/html5lib/filters/_base.py
+++ b/lib/html5lib/filters/_base.py
@ -1,3 +1,5 @@
 from __future__ import absolute_import, division, unicode_literals
 class Filter(object):
    def __init__(self, source):
--- a/lib/html5lib/filters/alphabeticalattributes.py
+++ b/lib/html5lib/filters/alphabeticalattributes.py
@ -0,0 +1,20 @@
 from __future__ import absolute_import, division, unicode_literals
 from . import _base
 try:
    from collections import OrderedDict
 except ImportError:
    from ordereddict import OrderedDict
 class Filter(_base.Filter):
    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            if token["type"] in ("StartTag", "EmptyTag"):
                attrs = OrderedDict()
                for name, value in sorted(token["data"].items(),
                                          key=lambda x: x[0]):
                    attrs[name] = value
                token["data"] = attrs
            yield token
--- a/lib/html5lib/filters/formfiller.py
+++ b/lib/html5lib/filters/formfiller.py
@ -1,127 +0,0 @@
 #
 # The goal is to finally have a form filler where you pass data for
 # each form, using the algorithm for "Seeding a form with initial values"
 # See http://www.whatwg.org/specs/web-forms/current-work/#seeding
 #
 import _base
 from html5lib.constants import spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)
 class SimpleFilter(_base.Filter):
    def __init__(self, source, fieldStorage):
        _base.Filter.__init__(self, source)
        self.fieldStorage = fieldStorage
    def __iter__(self):
        field_indices = {}
        state = None
        field_name = None
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type in ("StartTag", "EmptyTag"):
                name = token["name"].lower()
                if name == "input":
                    field_name = None
                    field_type = None
                    input_value_index = -1
                    input_checked_index = -1
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == u"name":
                            field_name = v.strip(spaceCharacters)
                        elif n == u"type":
                            field_type = v.strip(spaceCharacters)
                        elif n == u"checked":
                            input_checked_index = i
                        elif n == u"value":
                            input_value_index = i
                    value_list = self.fieldStorage.getlist(field_name)
                    field_index = field_indices.setdefault(field_name, 0)
                    if field_index < len(value_list):
                        value = value_list[field_index]
                    else:
                        value = ""
                    if field_type in (u"checkbox", u"radio"):
                        if value_list:
                            if token["data"][input_value_index][1] == value:
                                if input_checked_index < 0:
                                    token["data"].append((u"checked", u""))
                                field_indices[field_name] = field_index + 1
                            elif input_checked_index >= 0:
                                del token["data"][input_checked_index]
                    elif field_type not in (u"button", u"submit", u"reset"):
                        if input_value_index >= 0:
                            token["data"][input_value_index] = (u"value", value)
                        else:
                            token["data"].append((u"value", value))
                        field_indices[field_name] = field_index + 1
                    field_type = None
                    field_name = None
                elif name == "textarea":
                    field_type = "textarea"
                    field_name = dict((token["data"])[::-1])["name"]
                elif name == "select":
                    field_type = "select"
                    attributes = dict(token["data"][::-1])
                    field_name = attributes.get("name")
                    is_select_multiple = "multiple" in attributes
                    is_selected_option_found = False
                elif field_type == "select" and field_name and name == "option":
                    option_selected_index = -1
                    option_value = None
                    for i,(n,v) in enumerate(token["data"]):
                        n = n.lower()
                        if n == "selected":
                            option_selected_index = i
                        elif n == "value":
                            option_value = v.strip(spaceCharacters)
                    if option_value is None:
                        raise NotImplementedError("<option>s without a value= attribute")
                    else:
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
                                if option_selected_index < 0:
                                    token["data"].append((u"selected", u""))
                                field_indices[field_name] = field_index + 1
                                is_selected_option_found = True
                            elif option_selected_index >= 0:
                                del token["data"][option_selected_index]
            elif field_type is not None and field_name and type == "EndTag":
                name = token["name"].lower()
                if name == field_type:
                    if name == "textarea":
                        value_list = self.fieldStorage.getlist(field_name)
                        if value_list:
                            field_index = field_indices.setdefault(field_name, 0)
                            if field_index < len(value_list):
                                value = value_list[field_index]
                            else:
                                value = ""
                            yield {"type": "Characters", "data": value}
                            field_indices[field_name] = field_index + 1
                    field_name = None
                elif name == "option" and field_type == "select":
                    pass # TODO: part of "option without value= attribute" processing
            elif field_type == "textarea":
                continue # ignore token
            yield token
--- a/lib/html5lib/filters/inject_meta_charset.py
+++ b/lib/html5lib/filters/inject_meta_charset.py
@ -1,4 +1,7 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
 from . import _base
 class Filter(_base.Filter):
    def __init__(self, source, encoding):
@ -13,44 +16,44 @@ class Filter(_base.Filter):
        for token in _base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag":
-                if token["name"].lower() == u"head":
+                if token["name"].lower() == "head":
                    state = "in_head"
            elif type == "EmptyTag":
-                if token["name"].lower() == u"meta":
+                if token["name"].lower() == "meta":
                    # replace charset with actual encoding
                    has_http_equiv_content_type = False
-                   for (namespace,name),value in token["data"].iteritems():
+                    for (namespace, name), value in token["data"].items():
-                       if namespace != None:
+                        if namespace is not None:
                            continue
-                       elif name.lower() == u'charset':
+                        elif name.lower() == 'charset':
                            token["data"][(namespace, name)] = self.encoding
                            meta_found = True
                            break
-                       elif name == u'http-equiv' and value.lower() == u'content-type':
+                        elif name == 'http-equiv' and value.lower() == 'content-type':
                            has_http_equiv_content_type = True
                    else:
-                       if has_http_equiv_content_type and (None, u"content") in token["data"]:
+                        if has_http_equiv_content_type and (None, "content") in token["data"]:
-                           token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
+                            token["data"][(None, "content")] = 'text/html; charset=%s' % self.encoding
                            meta_found = True
-                elif token["name"].lower() == u"head" and not meta_found:
+                elif token["name"].lower() == "head" and not meta_found:
                    # insert meta into empty head
-                    yield {"type": "StartTag", "name": u"head",
+                    yield {"type": "StartTag", "name": "head",
                           "data": token["data"]}
-                    yield {"type": "EmptyTag", "name": u"meta",
+                    yield {"type": "EmptyTag", "name": "meta",
-                           "data": {(None, u"charset"): self.encoding}}
+                           "data": {(None, "charset"): self.encoding}}
-                    yield {"type": "EndTag", "name": u"head"}
+                    yield {"type": "EndTag", "name": "head"}
                    meta_found = True
                    continue
            elif type == "EndTag":
-                if token["name"].lower() == u"head" and pending:
+                if token["name"].lower() == "head" and pending:
                    # insert meta into head (if necessary) and flush pending queue
                    yield pending.pop(0)
                    if not meta_found:
-                        yield {"type": "EmptyTag", "name": u"meta",
+                        yield {"type": "EmptyTag", "name": "meta",
-                               "data": {(None, u"charset"): self.encoding}}
+                               "data": {(None, "charset"): self.encoding}}
                    while pending:
                        yield pending.pop(0)
                    meta_found = True
--- a/lib/html5lib/filters/lint.py
+++ b/lib/html5lib/filters/lint.py
@ -1,13 +1,18 @@
 from __future__ import absolute_import, division, unicode_literals
 from gettext import gettext
 _ = gettext
-import _base
+from . import _base
-from html5lib.constants import cdataElements, rcdataElements, voidElements
+from ..constants import cdataElements, rcdataElements, voidElements
-from html5lib.constants import spaceCharacters
+from ..constants import spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 class LintError(Exception):
    pass
 class LintError(Exception): pass
 class Filter(_base.Filter):
    def __iter__(self):
@ -18,24 +23,24 @@ class Filter(_base.Filter):
            if type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
+                    raise LintError(_("StartTag not in PCDATA content model flag: %(tag)s") % {"tag": name})
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                if not name:
-                    raise LintError(_(u"Empty tag name"))
+                    raise LintError(_("Empty tag name"))
                if type == "StartTag" and name in voidElements:
-                    raise LintError(_(u"Void element reported as StartTag token: %s") % name)
+                    raise LintError(_("Void element reported as StartTag token: %(tag)s") % {"tag": name})
                elif type == "EmptyTag" and name not in voidElements:
-                    raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
+                    raise LintError(_("Non-void element reported as EmptyTag token: %(tag)s") % {"tag": token["name"]})
                if type == "StartTag":
                    open_elements.append(name)
                for name, value in token["data"]:
-                    if not isinstance(name, unicode):
+                    if not isinstance(name, str):
-                        raise LintError(_("Attribute name is not a string: %r") % name)
+                        raise LintError(_("Attribute name is not a string: %(name)r") % {"name": name})
                    if not name:
-                        raise LintError(_(u"Empty attribute name"))
+                        raise LintError(_("Empty attribute name"))
-                    if not isinstance(value, unicode):
+                    if not isinstance(value, str):
-                        raise LintError(_("Attribute value is not a string: %r") % value)
+                        raise LintError(_("Attribute value is not a string: %(value)r") % {"value": value})
                if name in cdataElements:
                    contentModelFlag = "CDATA"
                elif name in rcdataElements:
@ -45,15 +50,15 @@ class Filter(_base.Filter):
            elif type == "EndTag":
                name = token["name"]
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                if not name:
-                    raise LintError(_(u"Empty tag name"))
+                    raise LintError(_("Empty tag name"))
                if name in voidElements:
-                    raise LintError(_(u"Void element reported as EndTag token: %s") % name)
+                    raise LintError(_("Void element reported as EndTag token: %(tag)s") % {"tag": name})
                start_name = open_elements.pop()
                if start_name != name:
-                    raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
+                    raise LintError(_("EndTag (%(end)s) does not match StartTag (%(start)s)") % {"end": name, "start": start_name})
                contentModelFlag = "PCDATA"
            elif type == "Comment":
@ -62,27 +67,27 @@ class Filter(_base.Filter):
            elif type in ("Characters", "SpaceCharacters"):
                data = token["data"]
-                if not isinstance(data, unicode):
+                if not isinstance(data, str):
-                    raise LintError(_("Attribute name is not a string: %r") % data)
+                    raise LintError(_("Attribute name is not a string: %(name)r") % {"name": data})
                if not data:
-                    raise LintError(_(u"%s token with empty data") % type)
+                    raise LintError(_("%(type)s token with empty data") % {"type": type})
                if type == "SpaceCharacters":
                    data = data.strip(spaceCharacters)
                    if data:
-                        raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
+                        raise LintError(_("Non-space character(s) found in SpaceCharacters token: %(token)r") % {"token": data})
            elif type == "Doctype":
                name = token["name"]
                if contentModelFlag != "PCDATA":
-                    raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
+                    raise LintError(_("Doctype not in PCDATA content model flag: %(name)s") % {"name": name})
-                if not isinstance(name, unicode):
+                if not isinstance(name, str):
-                    raise LintError(_(u"Tag name is not a string: %r") % name)
+                    raise LintError(_("Tag name is not a string: %(tag)r") % {"tag": name})
                # XXX: what to do with token["data"] ?
            elif type in ("ParseError", "SerializeError"):
                pass
            else:
-                raise LintError(_(u"Unknown token type: %s") % type)
+                raise LintError(_("Unknown token type: %(type)s") % {"type": type})
            yield token
--- a/lib/html5lib/filters/optionaltags.py
+++ b/lib/html5lib/filters/optionaltags.py
@ -1,4 +1,7 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
 from . import _base
 class Filter(_base.Filter):
    def slider(self):
--- a/lib/html5lib/filters/sanitizer.py
+++ b/lib/html5lib/filters/sanitizer.py
@ -1,8 +1,12 @@
-import _base
+from __future__ import absolute_import, division, unicode_literals
-from html5lib.sanitizer import HTMLSanitizerMixin
+
 from . import _base
 from ..sanitizer import HTMLSanitizerMixin
 class Filter(_base.Filter, HTMLSanitizerMixin):
    def __iter__(self):
        for token in _base.Filter.__iter__(self):
            token = self.sanitize_token(token)
-            if token: yield token
+            if token:
                yield token
--- a/lib/html5lib/filters/whitespace.py
+++ b/lib/html5lib/filters/whitespace.py
@ -1,16 +1,13 @@
-try:
+from __future__ import absolute_import, division, unicode_literals
    frozenset
 except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
 import re
-import _base
+from . import _base
-from html5lib.constants import rcdataElements, spaceCharacters
+from ..constants import rcdataElements, spaceCharacters
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)
 SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
 class Filter(_base.Filter):
@ -29,13 +26,13 @@ class Filter(_base.Filter):
            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
-                token["data"] = u" "
+                token["data"] = " "
            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])
            yield token
 def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)
--- a/lib/html5lib/html5parser.py
+++ b/lib/html5lib/html5parser.py
@ -1,82 +1,58 @@
-try:
+from __future__ import absolute_import, division, unicode_literals
-    frozenset
+from six import with_metaclass
 except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
 try:
    any
 except:
    # Implement 'any' for python 2.4 and previous
    def any(iterable):
        for element in iterable:
            if element:
                return True
        return False
 try:
    "abc".startswith(("a", "b"))
    def startswithany(str, prefixes):
        return str.startswith(prefixes)
 except:
    # Python 2.4 doesn't accept a tuple as argument to string startswith
    def startswithany(str, prefixes):
        for prefix in prefixes:
            if str.startswith(prefix):
                return True
        return False
 import sys
 import types
-import inputstream
+from . import inputstream
-import tokenizer
+from . import tokenizer
-import treebuilders
+from . import treebuilders
-from treebuilders._base import Marker
+from .treebuilders._base import Marker
 from treebuilders import simpletree
-import utils
+from . import utils
-import constants
+from . import constants
-from constants import spaceCharacters, asciiUpper2Lower
+from .constants import spaceCharacters, asciiUpper2Lower
-from constants import formattingElements, specialElements
+from .constants import specialElements
-from constants import headingElements, tableInsertModeElements
+from .constants import headingElements
-from constants import cdataElements, rcdataElements, voidElements
+from .constants import cdataElements, rcdataElements
-from constants import tokenTypes, ReparseException, namespaces, spaceCharacters
+from .constants import tokenTypes, ReparseException, namespaces
-from constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
+from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
 from .constants import adjustForeignAttributes as adjustForeignAttributesMap
-def parse(doc, treebuilder="simpletree", encoding=None,
+
 def parse(doc, treebuilder="etree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree"""
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parse(doc, encoding=encoding)
-def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None, 
+
 def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
                  namespaceHTMLElements=True):
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, encoding=encoding)
 def method_decorator_metaclass(function):
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
-            for attributeName, attribute in classDict.iteritems():
+            for attributeName, attribute in classDict.items():
-                if type(attribute) == types.FunctionType:
+                if isinstance(attribute, types.FunctionType):
                    attribute = function(attribute)
                classDict[attributeName] = attribute
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
 class HTMLParser(object):
    """HTML parser. Generates a tree structure from a stream of (possibly
        malformed) HTML"""
-    def __init__(self, tree = simpletree.TreeBuilder,
+    def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
-                 tokenizer = tokenizer.HTMLTokenizer, strict = False,
+                 strict=False, namespaceHTMLElements=True, debug=False):
                 namespaceHTMLElements = True, debug=False):
        """
        strict - raise an exception when a parse error is encountered
@ -92,12 +68,14 @@ class HTMLParser(object):
        # Raise an exception on the first error encountered
        self.strict = strict
        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        self.tree = tree(namespaceHTMLElements)
        self.tokenizer_class = tokenizer
        self.errors = []
        self.phases = dict([(name, cls(self, self.tree)) for name, cls in
-                            getPhases(debug).iteritems()])
+                            getPhases(debug).items()])
    def _parse(self, stream, innerHTML=False, container="div",
               encoding=None, parseMeta=True, useChardet=True, **kwargs):
@ -114,7 +92,7 @@ class HTMLParser(object):
            try:
                self.mainLoop()
                break
-            except ReparseException, e:
+            except ReparseException:
                self.reset()
    def reset(self):
@ -219,7 +197,6 @@ class HTMLParser(object):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": token["name"]})
        # When the loop finishes it's EOF
        reprocess = True
        phases = []
@ -279,100 +256,87 @@ class HTMLParser(object):
        return token
    def adjustMathMLAttributes(self, token):
-        replacements = {"definitionurl":u"definitionURL"}
+        replacements = {"definitionurl": "definitionURL"}
-        for k,v in replacements.iteritems():
+        for k, v in replacements.items():
            if k in token["data"]:
                token["data"][v] = token["data"][k]
                del token["data"][k]
    def adjustSVGAttributes(self, token):
        replacements = {
-            "attributename":u"attributeName",
+            "attributename": "attributeName",
-            "attributetype":u"attributeType",
+            "attributetype": "attributeType",
-            "basefrequency":u"baseFrequency",
+            "basefrequency": "baseFrequency",
-            "baseprofile":u"baseProfile",
+            "baseprofile": "baseProfile",
-            "calcmode":u"calcMode",
+            "calcmode": "calcMode",
-            "clippathunits":u"clipPathUnits",
+            "clippathunits": "clipPathUnits",
-            "contentscripttype":u"contentScriptType",
+            "contentscripttype": "contentScriptType",
-            "contentstyletype":u"contentStyleType",
+            "contentstyletype": "contentStyleType",
-            "diffuseconstant":u"diffuseConstant",
+            "diffuseconstant": "diffuseConstant",
-            "edgemode":u"edgeMode",
+            "edgemode": "edgeMode",
-            "externalresourcesrequired":u"externalResourcesRequired",
+            "externalresourcesrequired": "externalResourcesRequired",
-            "filterres":u"filterRes",
+            "filterres": "filterRes",
-            "filterunits":u"filterUnits",
+            "filterunits": "filterUnits",
-            "glyphref":u"glyphRef",
+            "glyphref": "glyphRef",
-            "gradienttransform":u"gradientTransform",
+            "gradienttransform": "gradientTransform",
-            "gradientunits":u"gradientUnits",
+            "gradientunits": "gradientUnits",
-            "kernelmatrix":u"kernelMatrix",
+            "kernelmatrix": "kernelMatrix",
-            "kernelunitlength":u"kernelUnitLength",
+            "kernelunitlength": "kernelUnitLength",
-            "keypoints":u"keyPoints",
+            "keypoints": "keyPoints",
-            "keysplines":u"keySplines",
+            "keysplines": "keySplines",
-            "keytimes":u"keyTimes",
+            "keytimes": "keyTimes",
-            "lengthadjust":u"lengthAdjust",
+            "lengthadjust": "lengthAdjust",
-            "limitingconeangle":u"limitingConeAngle",
+            "limitingconeangle": "limitingConeAngle",
-            "markerheight":u"markerHeight",
+            "markerheight": "markerHeight",
-            "markerunits":u"markerUnits",
+            "markerunits": "markerUnits",
-            "markerwidth":u"markerWidth",
+            "markerwidth": "markerWidth",
-            "maskcontentunits":u"maskContentUnits",
+            "maskcontentunits": "maskContentUnits",
-            "maskunits":u"maskUnits",
+            "maskunits": "maskUnits",
-            "numoctaves":u"numOctaves",
+            "numoctaves": "numOctaves",
-            "pathlength":u"pathLength",
+            "pathlength": "pathLength",
-            "patterncontentunits":u"patternContentUnits",
+            "patterncontentunits": "patternContentUnits",
-            "patterntransform":u"patternTransform",
+            "patterntransform": "patternTransform",
-            "patternunits":u"patternUnits",
+            "patternunits": "patternUnits",
-            "pointsatx":u"pointsAtX",
+            "pointsatx": "pointsAtX",
-            "pointsaty":u"pointsAtY",
+            "pointsaty": "pointsAtY",
-            "pointsatz":u"pointsAtZ",
+            "pointsatz": "pointsAtZ",
-            "preservealpha":u"preserveAlpha",
+            "preservealpha": "preserveAlpha",
-            "preserveaspectratio":u"preserveAspectRatio",
+            "preserveaspectratio": "preserveAspectRatio",
-            "primitiveunits":u"primitiveUnits",
+            "primitiveunits": "primitiveUnits",
-            "refx":u"refX",
+            "refx": "refX",
-            "refy":u"refY",
+            "refy": "refY",
-            "repeatcount":u"repeatCount",
+            "repeatcount": "repeatCount",
-            "repeatdur":u"repeatDur",
+            "repeatdur": "repeatDur",
-            "requiredextensions":u"requiredExtensions",
+            "requiredextensions": "requiredExtensions",
-            "requiredfeatures":u"requiredFeatures",
+            "requiredfeatures": "requiredFeatures",
-            "specularconstant":u"specularConstant",
+            "specularconstant": "specularConstant",
-            "specularexponent":u"specularExponent",
+            "specularexponent": "specularExponent",
-            "spreadmethod":u"spreadMethod",
+            "spreadmethod": "spreadMethod",
-            "startoffset":u"startOffset",
+            "startoffset": "startOffset",
-            "stddeviation":u"stdDeviation",
+            "stddeviation": "stdDeviation",
-            "stitchtiles":u"stitchTiles",
+            "stitchtiles": "stitchTiles",
-            "surfacescale":u"surfaceScale",
+            "surfacescale": "surfaceScale",
-            "systemlanguage":u"systemLanguage",
+            "systemlanguage": "systemLanguage",
-            "tablevalues":u"tableValues",
+            "tablevalues": "tableValues",
-            "targetx":u"targetX",
+            "targetx": "targetX",
-            "targety":u"targetY",
+            "targety": "targetY",
-            "textlength":u"textLength",
+            "textlength": "textLength",
-            "viewbox":u"viewBox",
+            "viewbox": "viewBox",
-            "viewtarget":u"viewTarget",
+            "viewtarget": "viewTarget",
-            "xchannelselector":u"xChannelSelector",
+            "xchannelselector": "xChannelSelector",
-            "ychannelselector":u"yChannelSelector",
+            "ychannelselector": "yChannelSelector",
-            "zoomandpan":u"zoomAndPan"
+            "zoomandpan": "zoomAndPan"
        }
-        for originalName in token["data"].keys():
+        for originalName in list(token["data"].keys()):
            if originalName in replacements:
                svgName = replacements[originalName]
                token["data"][svgName] = token["data"][originalName]
                del token["data"][originalName]
    def adjustForeignAttributes(self, token):
-        replacements = {
+        replacements = adjustForeignAttributesMap
            "xlink:actuate":("xlink", "actuate", namespaces["xlink"]),
            "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]),
            "xlink:href":("xlink", "href", namespaces["xlink"]),
            "xlink:role":("xlink", "role", namespaces["xlink"]),
            "xlink:show":("xlink", "show", namespaces["xlink"]),
            "xlink:title":("xlink", "title", namespaces["xlink"]),
            "xlink:type":("xlink", "type", namespaces["xlink"]),
            "xml:base":("xml", "base", namespaces["xml"]),
            "xml:lang":("xml", "lang", namespaces["xml"]),
            "xml:space":("xml", "space", namespaces["xml"]),
            "xmlns":(None, "xmlns", namespaces["xmlns"]),
            "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"])
            }
-        for originalName in token["data"].iterkeys():
+        for originalName in token["data"].keys():
            if originalName in replacements:
                foreignName = replacements[originalName]
                token["data"][foreignName] = token["data"][originalName]
@ -431,7 +395,7 @@ class HTMLParser(object):
        """
        assert contentType in ("RAWTEXT", "RCDATA")
-        element = self.tree.insertElement(token)
+        self.tree.insertElement(token)
        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
@ -442,11 +406,13 @@ class HTMLParser(object):
        self.phase = self.phases["text"]
 def getPhases(debug):
    def log(function):
        """Logger that records which phase processes each token"""
        type_names = dict((value, key) for key, value in
-                          constants.tokenTypes.iteritems())
+                          constants.tokenTypes.items())
        def wrapped(self, *args, **kwargs):
            if function.__name__.startswith("process") and len(args) > 0:
                token = args[0]
@ -473,21 +439,9 @@ def getPhases(debug):
        else:
            return type
-    class Phase(object):
+    class Phase(with_metaclass(getMetaclass(debug, log))):
        """Base class for helper object that implements each phase of processing
        """
        # Order should be (they can be omitted):
        # * EOF
        # * Comment
        # * Doctype
        # * SpaceCharacters
        # * Characters
        # * StartTag
        #   - startTag* methods
        # * EndTag
        #   - endTag* methods
        __metaclass__ = getMetaclass(debug, log)
        def __init__(self, parser, tree):
            self.parser = parser
@ -514,11 +468,11 @@ def getPhases(debug):
            return self.startTagHandler[token["name"]](token)
        def startTagHtml(self, token):
-            if self.parser.firstStartTag == False and token["name"] == "html":
+            if not self.parser.firstStartTag and token["name"] == "html":
                self.parser.parseError("non-html-root")
            # XXX Need a check here to see if the first start tag token emitted is
            # this token... If it's not, invoke self.parser.parseError().
-            for attr, value in token["data"].iteritems():
+            for attr, value in token["data"].items():
                if attr not in self.tree.openElements[0].attributes:
                    self.tree.openElements[0].attributes[attr] = value
            self.parser.firstStartTag = False
@ -539,8 +493,8 @@ def getPhases(debug):
            systemId = token["systemId"]
            correct = token["correct"]
-            if (name != "html" or publicId != None or
+            if (name != "html" or publicId is not None or
-                systemId != None and systemId != "about:legacy-compat"):
+                    systemId is not None and systemId != "about:legacy-compat"):
                self.parser.parseError("unknown-doctype")
            if publicId is None:
@ -552,7 +506,7 @@ def getPhases(debug):
                publicId = publicId.translate(asciiUpper2Lower)
            if (not correct or token["name"] != "html"
-                or startswithany(publicId,
+                or publicId.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
@ -612,19 +566,19 @@ def getPhases(debug):
                    ("-//w3o//dtd w3 html strict 3.0//en//",
                     "-/w3c/dtd html 4.0 transitional/en",
                     "html")
-                or startswithany(publicId,
+                or publicId.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
-                    systemId == None
+                    systemId is None
                    or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
                self.parser.compatMode = "quirks"
-            elif (startswithany(publicId,
+            elif (publicId.startswith(
                    ("-//w3c//dtd xhtml 1.0 frameset//",
                     "-//w3c//dtd xhtml 1.0 transitional//"))
-                  or startswithany(publicId,
+                  or publicId.startswith(
                      ("-//w3c//dtd html 4.01 frameset//",
                       "-//w3c//dtd html 4.01 transitional//")) and
-                      systemId != None):
+                  systemId is not None):
                self.parser.compatMode = "limited quirks"
            self.parser.phase = self.parser.phases["beforeHtml"]
@ -655,7 +609,6 @@ def getPhases(debug):
            self.anythingElse()
            return True
    class BeforeHtmlPhase(Phase):
        # helper methods
        def insertHtmlElement(self):
@ -691,7 +644,6 @@ def getPhases(debug):
                self.insertHtmlElement()
                return token
    class BeforeHeadPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
@ -789,7 +741,9 @@ def getPhases(debug):
            if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
                if "charset" in attributes:
                    self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
-                elif "content" in attributes:
+                elif ("content" in attributes and
                      "http-equiv" in attributes and
                      attributes["http-equiv"].lower() == "content-type"):
                    # Encoding it as UTF-8 here is a hack, as really we should pass
                    # the abstract Unicode string, and just use the
                    # ContentAttrParser on that, but using UTF-8 allows all chars
@ -831,12 +785,10 @@ def getPhases(debug):
        def anythingElse(self):
            self.endTagHead(impliedTagToken("head"))
    # XXX If we implement a parser for which scripting is disabled we need to
    # implement this phase.
    #
    # class InHeadNoScriptPhase(Phase):
    class AfterHeadPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
@ -904,7 +856,6 @@ def getPhases(debug):
            self.parser.phase = self.parser.phases["inBody"]
            self.parser.framesetOK = True
    class InBodyPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
        # the really-really-really-very crazy mode
@ -923,7 +874,7 @@ def getPhases(debug):
                ("frameset", self.startTagFrameset),
                (("address", "article", "aside", "blockquote", "center", "details",
                  "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
-                  "footer", "header", "hgroup", "menu", "nav", "ol", "p",
+                  "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
                  "section", "summary", "ul"),
                 self.startTagCloseP),
                (headingElements, self.startTagHeading),
@ -963,9 +914,9 @@ def getPhases(debug):
            self.endTagHandler = utils.MethodDispatcher([
                ("body", self.endTagBody),
                ("html", self.endTagHtml),
-                (("address", "article", "aside", "blockquote", "center",
+                (("address", "article", "aside", "blockquote", "button", "center",
-                  "details", "dir", "div", "dl", "fieldset", "figcaption", "figure",
+                  "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
-                  "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", 
+                  "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
                  "section", "summary", "ul"), self.endTagBlock),
                ("form", self.endTagForm),
                ("p", self.endTagP),
@ -1033,7 +984,7 @@ def getPhases(debug):
                self.tree.insertText(data)
        def processCharacters(self, token):
-            if token["data"] == u"\u0000":
+            if token["data"] == "\u0000":
                # The tokenizer should always emit null on its own
                return
            self.tree.reconstructActiveFormattingElements()
@ -1058,7 +1009,7 @@ def getPhases(debug):
                assert self.parser.innerHTML
            else:
                self.parser.framesetOK = False
-                for attr, value in token["data"].iteritems():
+                for attr, value in token["data"].items():
                    if attr not in self.tree.openElements[1].attributes:
                        self.tree.openElements[1].attributes[attr] = value
@ -1090,7 +1041,7 @@ def getPhases(debug):
        def startTagForm(self, token):
            if self.tree.formPointer:
-                self.parser.parseError(u"unexpected-start-tag", {"name": "form"})
+                self.parser.parseError("unexpected-start-tag", {"name": "form"})
            else:
                if self.tree.elementInScope("p", variant="button"):
                    self.endTagP(impliedTagToken("p"))
@ -1243,7 +1194,7 @@ def getPhases(debug):
            if "prompt" in token["data"]:
                prompt = token["data"]["prompt"]
            else:
-                prompt = u"This is a searchable index. Enter search keywords: "
+                prompt = "This is a searchable index. Enter search keywords: "
            self.processCharacters(
                {"type": tokenTypes["Characters"], "data": prompt})
            attributes = token["data"].copy()
@ -1436,62 +1387,102 @@ def getPhases(debug):
        def endTagFormatting(self, token):
            """The much-feared adoption agency algorithm"""
-            # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
+            # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
            # XXX Better parseError messages appreciated.
            name = token["name"]
            # Step 1
            outerLoopCounter = 0
            # Step 2
            while outerLoopCounter < 8:
                # Step 3
                outerLoopCounter += 1
-                # Step 1 paragraph 1
+                # Step 4:
                # Let the formatting element be the last element in
                # the list of active formatting elements that:
                # - is between the end of the list and the last scope
                # marker in the list, if any, or the start of the list
                # otherwise, and
                # - has the same tag name as the token.
                formattingElement = self.tree.elementInActiveFormattingElements(
                    token["name"])
                if (not formattingElement or
                    (formattingElement in self.tree.openElements and
                     not self.tree.elementInScope(formattingElement.name))):
-                    self.parser.parseError("adoption-agency-1.1", {"name": token["name"]})
+                    # If there is no such node, then abort these steps
                    # and instead act as described in the "any other
                    # end tag" entry below.
                    self.endTagOther(token)
                    return
-                # Step 1 paragraph 2
+                # Otherwise, if there is such a node, but that node is
                # not in the stack of open elements, then this is a
                # parse error; remove the element from the list, and
                # abort these steps.
                elif formattingElement not in self.tree.openElements:
                    self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                    self.tree.activeFormattingElements.remove(formattingElement)
                    return
-                # Step 1 paragraph 3
+                # Otherwise, if there is such a node, and that node is
                # also in the stack of open elements, but the element
                # is not in scope, then this is a parse error; ignore
                # the token, and abort these steps.
                elif not self.tree.elementInScope(formattingElement.name):
                    self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                    return
                # Otherwise, there is a formatting element and that
                # element is in the stack and is in scope. If the
                # element is not the current node, this is a parse
                # error. In any case, proceed with the algorithm as
                # written in the following steps.
                else:
                    if formattingElement != self.tree.openElements[-1]:
                        self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
-                # Step 2
+                # Step 5:
-                # Start of the adoption agency algorithm proper
+
                # Let the furthest block be the topmost node in the
                # stack of open elements that is lower in the stack
                # than the formatting element, and is an element in
                # the special category. There might not be one.
                afeIndex = self.tree.openElements.index(formattingElement)
                furthestBlock = None
                for element in self.tree.openElements[afeIndex:]:
                    if element.nameTuple in specialElements:
                        furthestBlock = element
                        break
-                # Step 3
+
                # Step 6:
                # If there is no furthest block, then the UA must
                # first pop all the nodes from the bottom of the stack
                # of open elements, from the current node up to and
                # including the formatting element, then remove the
                # formatting element from the list of active
                # formatting elements, and finally abort these steps.
                if furthestBlock is None:
                    element = self.tree.openElements.pop()
                    while element != formattingElement:
                        element = self.tree.openElements.pop()
                    self.tree.activeFormattingElements.remove(element)
                    return
                # Step 7
                commonAncestor = self.tree.openElements[afeIndex - 1]
-                # Step 5
+                # Step 8:
                #if furthestBlock.parent:
                #    furthestBlock.parent.removeChild(furthestBlock)
                # Step 5
                # The bookmark is supposed to help us identify where to reinsert
-                # nodes in step 12. We have to ensure that we reinsert nodes after
+                # nodes in step 15. We have to ensure that we reinsert nodes after
                # the node before the active formatting element. Note the bookmark
-                # can move in step 7.4
+                # can move in step 9.7
                bookmark = self.tree.activeFormattingElements.index(formattingElement)
-                # Step 6
+                # Step 9
                lastNode = node = furthestBlock
                innerLoopCounter = 0
@ -1504,15 +1495,13 @@ def getPhases(debug):
                    if node not in self.tree.activeFormattingElements:
                        self.tree.openElements.remove(node)
                        continue
-                    # Step 6.3
+                    # Step 9.6
                    if node == formattingElement:
                        break
-                    # Step 6.4
+                    # Step 9.7
                    if lastNode == furthestBlock:
-                        bookmark = (self.tree.activeFormattingElements.index(node)
+                        bookmark = self.tree.activeFormattingElements.index(node) + 1
-                                    + 1)
+                    # Step 9.8
                    # Step 6.5
                    #cite = node.parent
                    clone = node.cloneNode()
                    # Replace node with clone
                    self.tree.activeFormattingElements[
@ -1520,20 +1509,18 @@ def getPhases(debug):
                    self.tree.openElements[
                        self.tree.openElements.index(node)] = clone
                    node = clone
-
+                    # Step 9.9
                    # Step 6.6
                    # Remove lastNode from its parents, if any
                    if lastNode.parent:
                        lastNode.parent.removeChild(lastNode)
                    node.appendChild(lastNode)
-                    # Step 7.7
+                    # Step 9.10
                    lastNode = node
                    # End of inner loop 
-                # Step 7
+                # Step 10
                # Foster parent lastNode if commonAncestor is a
-                # table, tbody, tfoot, thead, or tr we need to foster parent the 
+                # table, tbody, tfoot, thead, or tr; in that case we need to
-                # lastNode
+                # foster parent the lastNode
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)
@ -1543,20 +1530,20 @@ def getPhases(debug):
                else:
                    commonAncestor.appendChild(lastNode)
-                # Step 8
+                # Step 11
                clone = formattingElement.cloneNode()
-                # Step 9
+                # Step 12
                furthestBlock.reparentChildren(clone)
-                # Step 10
+                # Step 13
                furthestBlock.appendChild(clone)
-                # Step 11
+                # Step 14
                self.tree.activeFormattingElements.remove(formattingElement)
                self.tree.activeFormattingElements.insert(bookmark, clone)
-                # Step 12
+                # Step 15
                self.tree.openElements.remove(formattingElement)
                self.tree.openElements.insert(
                    self.tree.openElements.index(furthestBlock) + 1, clone)
@ -1608,7 +1595,7 @@ def getPhases(debug):
        def processEOF(self):
            self.parser.parseError("expected-named-closing-tag-but-got-eof",
-                                   self.tree.openElements[-1].name)
+                                   {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()
            self.parser.phase = self.parser.originalPhase
            return True
@ -1624,7 +1611,7 @@ def getPhases(debug):
            # document.write works
        def endTagOther(self, token):
-            node = self.tree.openElements.pop()
+            self.tree.openElements.pop()
            self.parser.phase = self.parser.originalPhase
    class InTablePhase(Phase):
@ -1798,7 +1785,7 @@ def getPhases(debug):
            return True
        def processCharacters(self, token):
-            if token["data"] == u"\u0000":
+            if token["data"] == "\u0000":
                return
            self.characterTokens.append(token)
@ -1817,7 +1804,6 @@ def getPhases(debug):
            self.parser.phase = self.originalPhase
            return token
    class InCaptionPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
        def __init__(self, parser, tree):
@ -1889,7 +1875,6 @@ def getPhases(debug):
        def endTagOther(self, token):
            return self.parser.phases["inBody"].processEndTag(token)
    class InColumnGroupPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#in-column
@ -1955,7 +1940,6 @@ def getPhases(debug):
            if not ignoreEndTag:
                return token
    class InTableBodyPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
        def __init__(self, parser, tree):
@ -2054,7 +2038,6 @@ def getPhases(debug):
        def endTagOther(self, token):
            return self.parser.phases["inTable"].processEndTag(token)
    class InRowPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#in-row
        def __init__(self, parser, tree):
@ -2249,7 +2232,7 @@ def getPhases(debug):
                assert self.parser.innerHTML
        def processCharacters(self, token):
-            if token["data"] == u"\u0000":
+            if token["data"] == "\u0000":
                return
            self.tree.insertText(token["data"])
@ -2320,7 +2303,6 @@ def getPhases(debug):
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": token["name"]})
    class InSelectInTablePhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
@ -2360,7 +2342,6 @@ def getPhases(debug):
        def endTagOther(self, token):
            return self.parser.phases["inSelect"].processEndTag(token)
    class InForeignContentPhase(Phase):
        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                      "center", "code", "dd", "div", "dl", "dt",
@ -2370,53 +2351,54 @@ def getPhases(debug):
                                      "ol", "p", "pre", "ruby", "s", "small",
                                      "span", "strong", "strike", "sub", "sup",
                                      "table", "tt", "u", "ul", "var"])
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
        def adjustSVGTagNames(self, token):
-            replacements = {u"altglyph":u"altGlyph",
+            replacements = {"altglyph": "altGlyph",
-                            u"altglyphdef":u"altGlyphDef",
+                            "altglyphdef": "altGlyphDef",
-                            u"altglyphitem":u"altGlyphItem",
+                            "altglyphitem": "altGlyphItem",
-                            u"animatecolor":u"animateColor",
+                            "animatecolor": "animateColor",
-                            u"animatemotion":u"animateMotion",
+                            "animatemotion": "animateMotion",
-                            u"animatetransform":u"animateTransform",
+                            "animatetransform": "animateTransform",
-                            u"clippath":u"clipPath",
+                            "clippath": "clipPath",
-                            u"feblend":u"feBlend",
+                            "feblend": "feBlend",
-                            u"fecolormatrix":u"feColorMatrix",
+                            "fecolormatrix": "feColorMatrix",
-                            u"fecomponenttransfer":u"feComponentTransfer",
+                            "fecomponenttransfer": "feComponentTransfer",
-                            u"fecomposite":u"feComposite",
+                            "fecomposite": "feComposite",
-                            u"feconvolvematrix":u"feConvolveMatrix",
+                            "feconvolvematrix": "feConvolveMatrix",
-                            u"fediffuselighting":u"feDiffuseLighting",
+                            "fediffuselighting": "feDiffuseLighting",
-                            u"fedisplacementmap":u"feDisplacementMap",
+                            "fedisplacementmap": "feDisplacementMap",
-                            u"fedistantlight":u"feDistantLight",
+                            "fedistantlight": "feDistantLight",
-                            u"feflood":u"feFlood",
+                            "feflood": "feFlood",
-                            u"fefunca":u"feFuncA",
+                            "fefunca": "feFuncA",
-                            u"fefuncb":u"feFuncB",
+                            "fefuncb": "feFuncB",
-                            u"fefuncg":u"feFuncG",
+                            "fefuncg": "feFuncG",
-                            u"fefuncr":u"feFuncR",
+                            "fefuncr": "feFuncR",
-                            u"fegaussianblur":u"feGaussianBlur",
+                            "fegaussianblur": "feGaussianBlur",
-                            u"feimage":u"feImage",
+                            "feimage": "feImage",
-                            u"femerge":u"feMerge",
+                            "femerge": "feMerge",
-                            u"femergenode":u"feMergeNode",
+                            "femergenode": "feMergeNode",
-                            u"femorphology":u"feMorphology",
+                            "femorphology": "feMorphology",
-                            u"feoffset":u"feOffset",
+                            "feoffset": "feOffset",
-                            u"fepointlight":u"fePointLight",
+                            "fepointlight": "fePointLight",
-                            u"fespecularlighting":u"feSpecularLighting",
+                            "fespecularlighting": "feSpecularLighting",
-                            u"fespotlight":u"feSpotLight",
+                            "fespotlight": "feSpotLight",
-                            u"fetile":u"feTile",
+                            "fetile": "feTile",
-                            u"feturbulence":u"feTurbulence",
+                            "feturbulence": "feTurbulence",
-                            u"foreignobject":u"foreignObject",
+                            "foreignobject": "foreignObject",
-                            u"glyphref":u"glyphRef",
+                            "glyphref": "glyphRef",
-                            u"lineargradient":u"linearGradient",
+                            "lineargradient": "linearGradient",
-                            u"radialgradient":u"radialGradient",
+                            "radialgradient": "radialGradient",
-                            u"textpath":u"textPath"}
+                            "textpath": "textPath"}
            if token["name"] in replacements:
                token["name"] = replacements[token["name"]]
        def processCharacters(self, token):
-            if token["data"] == u"\u0000":
+            if token["data"] == "\u0000":
-                token["data"] = u"\uFFFD"
+                token["data"] = "\uFFFD"
            elif (self.parser.framesetOK and
                  any(char not in spaceCharacters for char in token["data"])):
                self.parser.framesetOK = False
@ -2428,7 +2410,7 @@ def getPhases(debug):
                (token["name"] == "font" and
                 set(token["data"].keys()) & set(["color", "face", "size"]))):
                self.parser.parseError("unexpected-html-element-in-foreign-content",
-                                       token["name"])
+                                       {"name": token["name"]})
                while (self.tree.openElements[-1].namespace !=
                       self.tree.defaultNamespace and
                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
@ -2453,7 +2435,7 @@ def getPhases(debug):
            nodeIndex = len(self.tree.openElements) - 1
            node = self.tree.openElements[-1]
            if node.name != token["name"]:
-                self.parser.parseError("unexpected-end-tag", token["name"])
+                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
            while True:
                if node.name.translate(asciiUpper2Lower) == token["name"]:
@ -2475,7 +2457,6 @@ def getPhases(debug):
                    break
            return new_token
    class AfterBodyPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
@ -2580,7 +2561,6 @@ def getPhases(debug):
            self.parser.parseError("unexpected-end-tag-in-frameset",
                                   {"name": token["name"]})
    class AfterFramesetPhase(Phase):
        # http://www.whatwg.org/specs/web-apps/current-work/#after3
        def __init__(self, parser, tree):
@ -2618,7 +2598,6 @@ def getPhases(debug):
            self.parser.parseError("unexpected-end-tag-after-frameset",
                                   {"name": token["name"]})
    class AfterAfterBodyPhase(Phase):
        def __init__(self, parser, tree):
            Phase.__init__(self, parser, tree)
@ -2693,7 +2672,6 @@ def getPhases(debug):
            self.parser.parseError("expected-eof-but-got-end-tag",
                                   {"name": token["name"]})
    return {
        "initial": InitialPhase,
        "beforeHtml": BeforeHtmlPhase,
@ -2721,13 +2699,15 @@ def getPhases(debug):
        # XXX after after frameset
    }
 def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    if attributes is None:
        attributes = {}
-    return {"type":tokenTypes[type], "name":unicode(name), "data":attributes,
+    return {"type": tokenTypes[type], "name": name, "data": attributes,
            "selfClosing": selfClosing}
 class ParseError(Exception):
    """Error in parsed document"""
    pass
--- a/lib/html5lib/ihatexml.py
+++ b/lib/html5lib/ihatexml.py
@ -1,14 +1,93 @@
-import re
+from __future__ import absolute_import, division, unicode_literals
-baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+import re
 import warnings
 from .constants import DataLossWarning
 baseChar = """
 [#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] |
 [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] |
 [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] |
 [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 |
 [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] |
 [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] |
 [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] |
 [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] |
 [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 |
 [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] |
 [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] |
 [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D |
 [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] |
 [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] |
 [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] |
 [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] |
 [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] |
 [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] |
 [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 |
 [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] |
 [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] |
 [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] |
 [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] |
 [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] |
 [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] |
 [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] |
 [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] |
 [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] |
 [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] |
 [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A |
 #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 |
 #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] |
 #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] |
 [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] |
 [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C |
 #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 |
 [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] |
 [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] |
 [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 |
 [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] |
 [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B |
 #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE |
 [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] |
 [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 |
 [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] |
 [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
 ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
-combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
+combiningCharacter = """
 [#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] |
 [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 |
 [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] |
 [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] |
 #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] |
 [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] |
 [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 |
 #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] |
 [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC |
 [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] |
 #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] |
 [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] |
 [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] |
 [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] |
 [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] |
 [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] |
 #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 |
 [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] |
 #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] |
 [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] |
 [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] |
 #x3099 | #x309A"""
-digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+digit = """
 [#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] |
 [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] |
 [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] |
 [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
-extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+extender = """
 #x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 |
 [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
 letter = " | ".join([baseChar, ideographic])
@ -20,6 +99,7 @@ nameFirst = " | ".join([letter, "_"])
 reChar = re.compile(r"#x([\d|A-F]{4,4})")
 reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
 def charStringToList(chars):
    charRanges = [item.strip() for item in chars.split(" | ")]
    rv = []
@ -40,6 +120,7 @@ def charStringToList(chars):
    rv = normaliseCharList(rv)
    return rv
 def normaliseCharList(charList):
    charList = sorted(charList)
    for item in charList:
@ -58,6 +139,7 @@ def normaliseCharList(charList):
 # We don't really support characters above the BMP :(
 max_unicode = int("FFFF", 16)
 def missingRanges(charList):
    rv = []
    if charList[0] != 0:
@ -68,42 +150,49 @@ def missingRanges(charList):
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
 def listToRegexpStr(charList):
    rv = []
    for item in charList:
        if item[0] == item[1]:
-           rv.append(escapeRegexp(unichr(item[0])))
+            rv.append(escapeRegexp(chr(item[0])))
        else:
-            rv.append(escapeRegexp(unichr(item[0])) + "-" +
+            rv.append(escapeRegexp(chr(item[0])) + "-" +
-                      escapeRegexp(unichr(item[1])))
+                      escapeRegexp(chr(item[1])))
    return "[%s]" % "".join(rv)
 def hexToInt(hex_str):
    """Interpret *hex_str* as a base-16 number and return it as an int."""
    value = int(hex_str, 16)
    return value
 def escapeRegexp(string):
    """Backslash-escape regular-expression metacharacters in *string*.

    Each of ``. ^ $ * + ? { } [ ] | ( ) -`` is prefixed with a backslash;
    every other character is left untouched. Returns the escaped string.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        # The backslash itself is not in the tuple, so escapes inserted by
        # an earlier iteration are never escaped a second time.
        string = string.replace(char, "\\" + char)
    # The former debug print (a Python 2 print statement that fired for
    # every escaped character) has been removed: it polluted stdout and is
    # a syntax error on Python 3.
    return string
 # output from the above
-nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\
u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
+nonXmlNameBMPRegexp = re.compile('[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 # Simpler things
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]")
 nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\
u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
 class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
    def __init__(self, replaceChars=None,
                 dropXmlnsLocalName=False,
                 dropXmlnsAttrNs=False,
                 preventDoubleDashComments=False,
                 preventDashAtCommentEnd=False,
-                 replaceFormFeedCharacters = True):
+                 replaceFormFeedCharacters=True,
                 preventSingleQuotePubid=False):
        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -113,14 +202,17 @@ class InfosetFilter(object):
        self.replaceFormFeedCharacters = replaceFormFeedCharacters
        self.preventSingleQuotePubid = preventSingleQuotePubid
        self.replaceCache = {}
    def coerceAttribute(self, name, namespace=None):
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
-            #Need a datalosswarning here
+            warnings.warn("Attributes cannot begin with xmlns", DataLossWarning)
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            warnings.warn("Attributes cannot be in the xml namespace", DataLossWarning)
            return None
        else:
            return self.toXmlName(name)
@ -131,20 +223,35 @@ class InfosetFilter(object):
    def coerceComment(self, data):
        """Return comment text with adjacent dashes separated, if enabled.

        When ``self.preventDoubleDashComments`` is true, every "--" in
        *data* is rewritten as "- -" (warning with DataLossWarning on each
        pass) until none remain. Otherwise *data* is returned unchanged.
        """
        if not self.preventDoubleDashComments:
            return data
        # A single replace pass can leave a new "--" behind (e.g. "---"
        # becomes "- --"), hence the loop.
        while data.find("--") != -1:
            warnings.warn("Comments cannot contain adjacent dashes", DataLossWarning)
            data = data.replace("--", "- -")
        return data
    def coerceCharacters(self, data):
        """Return text with form feeds replaced by spaces, if enabled.

        When ``self.replaceFormFeedCharacters`` is true, one DataLossWarning
        is issued per U+000C found and each is replaced by a single space.
        Otherwise *data* is returned unchanged.
        """
        if not self.replaceFormFeedCharacters:
            return data
        for _ in range(data.count("\x0C")):
            warnings.warn("Text cannot contain U+000C", DataLossWarning)
        # Other non-xml characters are not handled here.
        return data.replace("\x0C", " ")
    def coercePubid(self, data):
        """Return *data* with characters not allowed in a pubid replaced.

        Every character matched by ``nonPubidCharRegexp`` is swapped for
        the value of ``self.getReplacementCharacter(char)``, with a
        DataLossWarning per match. If ``self.preventSingleQuotePubid`` is
        true, single quotes are replaced the same way.
        """
        coerced = data
        for badChar in nonPubidCharRegexp.findall(data):
            warnings.warn("Coercing non-XML pubid", DataLossWarning)
            substitute = self.getReplacementCharacter(badChar)
            coerced = coerced.replace(badChar, substitute)
        if self.preventSingleQuotePubid and "'" in coerced:
            warnings.warn("Pubid cannot contain single quote", DataLossWarning)
            coerced = coerced.replace("'", self.getReplacementCharacter("'"))
        return coerced
    def toXmlName(self, name):
        nameFirst = name[0]
        nameRest = name[1:]
        m = nonXmlNameFirstBMPRegexp.match(nameFirst)
        if m:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            nameFirstOutput = self.getReplacementCharacter(nameFirst)
        else:
            nameFirstOutput = nameFirst
@ -152,6 +259,7 @@ class InfosetFilter(object):
        nameRestOutput = nameRest
        replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
        for char in replaceChars:
            warnings.warn("Coercing non-XML name", DataLossWarning)
            replacement = self.getReplacementCharacter(char)
            nameRestOutput = nameRestOutput.replace(char, replacement)
        return nameFirstOutput + nameRestOutput
@ -169,9 +277,9 @@ class InfosetFilter(object):
        return name
    def escapeChar(self, char):
-        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
+        replacement = "U%05X" % ord(char)
        self.replaceCache[char] = replacement
        return replacement
    def unescapeChar(self, charcode):
-        return unichr(int(charcode[1:], 16))
+        return chr(int(charcode[1:], 16))
--- a/lib/html5lib/inputstream.py
+++ b/lib/html5lib/inputstream.py
@ -1,19 +1,34 @@
 from __future__ import absolute_import, division, unicode_literals
 from six import text_type
 from six.moves import http_client
 import codecs
 import re
 import types
 import sys
-from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings, ReparseException
+from .constants import encodings, ReparseException
-import utils
+from . import utils
 from io import StringIO
 try:
    from io import BytesIO
 except ImportError:
    BytesIO = StringIO
 try:
    from io import BufferedIOBase
 except ImportError:
    class BufferedIOBase(object):
        pass
 # Non-unicode versions of constants for use in the pre-parser
-spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
+spaceCharactersBytes = frozenset([item.encode("ascii") for item in spaceCharacters])
-asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
+asciiLettersBytes = frozenset([item.encode("ascii") for item in asciiLetters])
-asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
+asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
-spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
+spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])
-invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
 non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
@ -23,12 +38,13 @@ non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
                                  0x10FFFE, 0x10FFFF])
-ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
 # Cache for charsUntil()
 charsUntilRegEx = {}
-class BufferedStream:
+
 class BufferedStream(object):
    """Buffering for streams that do not have buffering of their own
    The buffer is implemented as a list of chunks on the assumption that
@ -48,11 +64,11 @@ class BufferedStream:
        return pos
    def seek(self, pos):
-        assert pos < self._bufferedBytes()
+        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
-            offset -= pos
+            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]
@ -91,8 +107,7 @@ class BufferedStream:
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
-            data = rv.append(bufferedData[bufferOffset: 
+            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
                                          bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead
            bufferOffset = 0
@ -100,11 +115,29 @@ class BufferedStream:
        if remainingBytes:
            rv.append(self._readStream(remainingBytes))
-        return "".join(rv)
+        return b"".join(rv)
 def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
    if isinstance(source, http_client.HTTPResponse):
        # Work around Python bug #20007: read(0) closes the connection.
        # http://bugs.python.org/issue20007
        isUnicode = False
    elif hasattr(source, "read"):
        isUnicode = isinstance(source.read(0), text_type)
    else:
        isUnicode = isinstance(source, text_type)
-class HTMLInputStream:
+    if isUnicode:
        if encoding is not None:
            raise TypeError("Cannot explicitly set an encoding with a unicode string")
        return HTMLUnicodeInputStream(source)
    else:
        return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
 class HTMLUnicodeInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.
    This class takes care of character encoding and removing or replacing
@ -114,7 +147,7 @@ class HTMLInputStream:
    _defaultChunkSize = 10240
-    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
+    def __init__(self, source):
        """Initialises the HTMLInputStream.
        HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -132,43 +165,23 @@ class HTMLInputStream:
        """
        # Craziness
-        if len(u"\U0010FFFF") == 1:
+        if len("\U0010FFFF") == 1:
            self.reportCharacterErrors = self.characterErrorsUCS4
-            self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
+            self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
        else:
            self.reportCharacterErrors = self.characterErrorsUCS2
-            self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
+            self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
        # List of where new lines occur
        self.newLines = [0]
-        self.charEncoding = (codecName(encoding), "certain")
+        self.charEncoding = ("utf-8", "certain")
-
+        self.dataStream = self.openStream(source)
        # Raw Stream - for unicode objects this will encode to utf-8 and set
        #              self.charEncoding as appropriate
        self.rawStream = self.openStream(source)
        # Encoding Information
        #Number of bytes to use when looking for a meta element with
        #encoding information
        self.numBytesMeta = 512
        #Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        #Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"
        #Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)
        self.reset()
    def reset(self):
-        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
+        self.chunk = ""
                                                                 'replace')
        self.chunk = u""
        self.chunkSize = 0
        self.chunkOffset = 0
        self.errors = []
@ -191,126 +204,15 @@ class HTMLInputStream:
        if hasattr(source, 'read'):
            stream = source
        else:
-            # Otherwise treat source as a string and convert to a file object
+            stream = StringIO(source)
            if isinstance(source, unicode):
                source = source.encode('utf-8')
                self.charEncoding = ("utf-8", "certain")
            try:
                from io import BytesIO
            except:
                # 2to3 converts this line to: from io import StringIO  
                from cStringIO import StringIO as BytesIO
            stream = BytesIO(source)
        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
            stream is sys.stdin):
            stream = BufferedStream(stream)
        return stream
    def detectEncoding(self, parseMeta=True, chardet=True):
        #First look for a BOM
        #This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        #If there is no BOM need to look for meta elements with encoding 
        #information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        #Guess with chardet, if avaliable
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                from chardet.universaldetector import UniversalDetector
                buffers = []
                detector = UniversalDetector()
                while not detector.done:
                    buffer = self.rawStream.read(self.numBytesChardet)
                    if not buffer:
                        break
                    buffers.append(buffer)
                    detector.feed(buffer)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence="tentative"
            encoding = self.defaultEncoding
        #Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1":"windows-1252"}
        if encoding.lower() in encodingSub:
            encoding = encodingSub[encoding.lower()]
        return encoding, confidence
    def changeEncoding(self, newEncoding):
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            self.rawStream.seek(0)
            self.reset()
            self.charEncoding = (newEncoding, "certain")
            raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
    def detectBOM(self):
        """Attempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }
        # Go to beginning of file and read in 4 bytes
        string = self.rawStream.read(4)
        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2]) # UTF-16
                seek = 2
        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)
        return encoding
    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()
        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"
        return encoding
    def _position(self, offset):
        chunk = self.chunk
-        nLines = chunk.count(u'\n', 0, offset)
+        nLines = chunk.count('\n', 0, offset)
        positionLine = self.prevNumLines + nLines
-        lastLinePos = chunk.rfind(u'\n', 0, offset)
+        lastLinePos = chunk.rfind('\n', 0, offset)
        if lastLinePos == -1:
            positionColumn = self.prevNumCols + offset
        else:
@ -343,7 +245,7 @@ class HTMLInputStream:
        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
-        self.chunk = u""
+        self.chunk = ""
        self.chunkSize = 0
        self.chunkOffset = 0
@ -367,10 +269,10 @@ class HTMLInputStream:
        # Replace invalid characters
        # Note U+0000 is dealt with in the tokenizer
-        data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
+        data = self.replaceCharactersRegexp.sub("\ufffd", data)
-        data = data.replace(u"\r\n", u"\n")
+        data = data.replace("\r\n", "\n")
-        data = data.replace(u"\r", u"\n")
+        data = data.replace("\r", "\n")
        self.chunk = data
        self.chunkSize = len(data)
@ -378,14 +280,13 @@ class HTMLInputStream:
        return True
    def characterErrorsUCS4(self, data):
-        for i in xrange(len(invalid_unicode_re.findall(data))):
+        for i in range(len(invalid_unicode_re.findall(data))):
            self.errors.append("invalid-codepoint")
    def characterErrorsUCS2(self, data):
        # Someone picked the wrong compile option
        # You lose
        skip = False
        import sys
        for match in invalid_unicode_re.finditer(data):
            if skip:
                continue
@ -419,10 +320,10 @@ class HTMLInputStream:
            if __debug__:
                for c in characters:
                    assert(ord(c) < 128)
-            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
+            regex = "".join(["\\x%02x" % ord(c) for c in characters])
            if not opposite:
-                regex = u"^%s" % regex
+                regex = "^%s" % regex
-            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
+            chars = charsUntilRegEx[(characters, opposite)] = re.compile("[%s]+" % regex)
        rv = []
@ -449,7 +350,7 @@ class HTMLInputStream:
                # Reached EOF
                break
-        r = u"".join(rv)
+        r = "".join(rv)
        return r
    def unget(self, char):
@ -468,12 +369,192 @@ class HTMLInputStream:
                self.chunkOffset -= 1
                assert self.chunk[self.chunkOffset] == char
-class EncodingBytes(str):
+
 class HTMLBinaryInputStream(HTMLUnicodeInputStream):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    The raw byte source is kept in ``self.rawStream``; ``self.dataStream``
    is a ``codecs`` stream reader layered on top of it that decodes with
    ``self.charEncoding[0]`` using the ``'replace'`` error handler.
    """

    def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source - a file-like object (anything with a ``read`` attribute) or
        a bytes string; anything else is handed to ``BytesIO``.

        encoding - optional encoding name.  If it maps to a known codec it
        is used with confidence "certain", regardless of any BOM or later
        declaration (such as in a meta element).

        parseMeta - Look for a <meta> element containing encoding information

        chardet - fall back to the charade/chardet detector when neither a
        BOM nor a <meta> declaration yields an encoding
        """
        # Raw Stream - the underlying seekable byte stream
        self.rawStream = self.openStream(source)
        # NOTE(review): the base initialiser is given the already-opened
        # stream; presumably it sets up the shared chunk/position state.
        HTMLUnicodeInputStream.__init__(self, self.rawStream)
        self.charEncoding = (codecName(encoding), "certain")
        # Encoding Information
        # Number of bytes to use when looking for a meta element with
        # encoding information
        self.numBytesMeta = 512
        # Number of bytes to use when using detecting encoding using chardet
        self.numBytesChardet = 100
        # Encoding to use if no other information can be found
        self.defaultEncoding = "windows-1252"
        # Detect encoding iff no explicit "transport level" encoding is supplied
        if (self.charEncoding[0] is None):
            self.charEncoding = self.detectEncoding(parseMeta, chardet)
        # Build the decoder for the encoding chosen above
        self.reset()

    def reset(self):
        """Rebuild the decoding reader for the current encoding and reset
        the state kept by HTMLUnicodeInputStream."""
        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                 'replace')
        HTMLUnicodeInputStream.reset(self)

    def openStream(self, source):
        """Produces a seekable file object from source.

        source can be either a file object or a bytes string.  Streams on
        which ``seek``/``tell`` do not work are wrapped in BufferedStream.
        """
        # Already a file object
        if hasattr(source, 'read'):
            stream = source
        else:
            stream = BytesIO(source)
        try:
            stream.seek(stream.tell())
        except Exception:
            # Only ordinary errors mean "not seekable"; KeyboardInterrupt
            # and SystemExit must keep propagating.
            stream = BufferedStream(stream)
        return stream

    def detectEncoding(self, parseMeta=True, chardet=True):
        """Work out the encoding of the raw stream.

        Tries, in order: a BOM, a <meta> declaration (if parseMeta), the
        charade/chardet detector (if chardet and importable), and finally
        ``self.defaultEncoding``.

        Returns an ``(encoding, confidence)`` tuple where confidence is
        "certain" only when the encoding came from a BOM.
        """
        # First look for a BOM
        # This will also read past the BOM if present
        encoding = self.detectBOM()
        confidence = "certain"
        # If there is no BOM need to look for meta elements with encoding
        # information
        if encoding is None and parseMeta:
            encoding = self.detectEncodingMeta()
            confidence = "tentative"
        # Guess with chardet, if available
        if encoding is None and chardet:
            confidence = "tentative"
            try:
                try:
                    from charade.universaldetector import UniversalDetector
                except ImportError:
                    from chardet.universaldetector import UniversalDetector
                detector = UniversalDetector()
                # Feed the detector chunk by chunk; the chunks themselves
                # are not needed afterwards, so they are not retained.
                while not detector.done:
                    chunk = self.rawStream.read(self.numBytesChardet)
                    assert isinstance(chunk, bytes)
                    if not chunk:
                        break
                    detector.feed(chunk)
                detector.close()
                encoding = detector.result['encoding']
                self.rawStream.seek(0)
            except ImportError:
                # Neither detector is installed; fall through to the default
                pass
        # If all else fails use the default encoding
        if encoding is None:
            confidence = "tentative"
            encoding = self.defaultEncoding
        # Substitute for equivalent encodings:
        encodingSub = {"iso-8859-1": "windows-1252"}
        encoding = encodingSub.get(encoding.lower(), encoding)
        return encoding, confidence

    def changeEncoding(self, newEncoding):
        """Switch to newEncoding after the stream has started being read.

        Unknown encodings are ignored.  If newEncoding equals the current
        one it is merely promoted to "certain".  Otherwise the raw stream
        is rewound, the decoder rebuilt for the new encoding and
        ReparseException raised so the caller can start over.
        """
        assert self.charEncoding[1] != "certain"
        newEncoding = codecName(newEncoding)
        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
            newEncoding = "utf-8"
        if newEncoding is None:
            return
        elif newEncoding == self.charEncoding[0]:
            self.charEncoding = (self.charEncoding[0], "certain")
        else:
            previousEncoding = self.charEncoding[0]
            self.rawStream.seek(0)
            # Store the new encoding *before* reset(): reset() builds the
            # decoder from self.charEncoding[0], so the old order left the
            # rewound stream being decoded with the previous codec.
            self.charEncoding = (newEncoding, "certain")
            self.reset()
            raise ReparseException("Encoding changed from %s to %s" % (previousEncoding, newEncoding))

    def detectBOM(self):
        """Attempts to detect a BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return None"""
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }
        # Read in 4 bytes from the current position of the stream
        string = self.rawStream.read(4)
        assert isinstance(string, bytes)
        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])         # UTF-8
        seek = 3
        if not encoding:
            # Need to detect UTF-32 before UTF-16
            encoding = bomDict.get(string)         # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2
        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(seek if encoding else 0)
        return encoding

    def detectEncodingMeta(self):
        """Report the encoding declared by the meta element

        Scans the first ``self.numBytesMeta`` bytes with EncodingParser,
        rewinds the raw stream, and maps any UTF-16 variant to "utf-8".
        Returns the encoding reported by the parser.
        """
        buffer = self.rawStream.read(self.numBytesMeta)
        assert isinstance(buffer, bytes)
        parser = EncodingParser(buffer)
        self.rawStream.seek(0)
        encoding = parser.getEncoding()
        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
            encoding = "utf-8"
        return encoding
 class EncodingBytes(bytes):
    """String-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raised"""
    def __new__(self, value):
-        return str.__new__(self, value.lower())
+        assert isinstance(value, bytes)
        return bytes.__new__(self, value.lower())
    def __init__(self, value):
        self._position = -1
@ -481,13 +562,17 @@ class EncodingBytes(str):
    def __iter__(self):
        return self
-    def next(self):
+    def __next__(self):
        p = self._position = self._position + 1
        if p >= len(self):
            raise StopIteration
        elif p < 0:
            raise TypeError
-        return self[p]
+        return self[p:p + 1]
    def next(self):
        # Py2 compat
        return self.__next__()
    def previous(self):
        p = self._position
@ -496,7 +581,7 @@ class EncodingBytes(str):
        elif p < 0:
            raise TypeError
        self._position = p = p - 1
-        return self[p]
+        return self[p:p + 1]
    def setPosition(self, position):
        if self._position >= len(self):
@ -514,7 +599,7 @@ class EncodingBytes(str):
    position = property(getPosition, setPosition)
    def getCurrentByte(self):
-        return self[self.position]
+        return self[self.position:self.position + 1]
    currentByte = property(getCurrentByte)
@ -522,7 +607,7 @@ class EncodingBytes(str):
        """Skip past a list of characters"""
        p = self.position               # use property for the error-checking
        while p < len(self):
-            c = self[p]
+            c = self[p:p + 1]
            if c not in chars:
                self._position = p
                return c
@ -533,7 +618,7 @@ class EncodingBytes(str):
    def skipUntil(self, chars):
        p = self.position
        while p < len(self):
-            c = self[p]
+            c = self[p:p + 1]
            if c in chars:
                self._position = p
                return c
@ -565,6 +650,7 @@ class EncodingBytes(str):
        else:
            raise StopIteration
 class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""
@ -575,12 +661,12 @@ class EncodingParser(object):
    def getEncoding(self):
        methodDispatch = (
-            ("<!--",self.handleComment),
+            (b"<!--", self.handleComment),
-            ("<meta",self.handleMeta),
+            (b"<meta", self.handleMeta),
-            ("</",self.handlePossibleEndTag),
+            (b"</", self.handlePossibleEndTag),
-            ("<!",self.handleOther),
+            (b"<!", self.handleOther),
-            ("<?",self.handleOther),
+            (b"<?", self.handleOther),
-            ("<",self.handlePossibleStartTag))
+            (b"<", self.handlePossibleStartTag))
        for byte in self.data:
            keepParsing = True
            for key, method in methodDispatch:
@ -598,38 +684,49 @@ class EncodingParser(object):
    def handleComment(self):
        """Skip over comments"""
-        return self.data.jumpTo("-->")
+        return self.data.jumpTo(b"-->")
    def handleMeta(self):
        if self.data.currentByte not in spaceCharactersBytes:
            # if we have <meta not followed by a space so just keep going
            return True
        # We have a valid meta element we want to search for attributes
        hasPragma = False
        pendingEncoding = None
        while True:
            # Try to find the next attribute after the current position
            attr = self.getAttribute()
            if attr is None:
                return True
            else:
-                if attr[0] == "charset":
+                if attr[0] == b"http-equiv":
                    hasPragma = attr[1] == b"content-type"
                    if hasPragma and pendingEncoding is not None:
                        self.encoding = pendingEncoding
                        return False
                elif attr[0] == b"charset":
                    tentativeEncoding = attr[1]
                    codec = codecName(tentativeEncoding)
                    if codec is not None:
                        self.encoding = codec
                        return False
-                elif attr[0] == "content":
+                elif attr[0] == b"content":
                    contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                    tentativeEncoding = contentParser.parse()
                    if tentativeEncoding is not None:
                        codec = codecName(tentativeEncoding)
                        if codec is not None:
                            if hasPragma:
                                self.encoding = codec
                                return False
                            else:
                                pendingEncoding = codec
    def handlePossibleStartTag(self):
        return self.handlePossibleTag(False)
    def handlePossibleEndTag(self):
-        self.data.next()
+        next(self.data)
        return self.handlePossibleTag(True)
    def handlePossibleTag(self, endTag):
@ -644,7 +741,7 @@ class EncodingParser(object):
            return True
        c = data.skipUntil(spacesAngleBrackets)
-        if c == "<":
+        if c == b"<":
            # return to the first step in the overall "two step" algorithm
            # reprocessing the < byte
            data.previous()
@ -656,66 +753,66 @@ class EncodingParser(object):
        return True
    def handleOther(self):
-        return self.data.jumpTo(">")
+        return self.data.jumpTo(b">")
    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None"""
        data = self.data
        # Step 1 (skip chars)
-        c = data.skip(spaceCharactersBytes | frozenset("/"))
+        c = data.skip(spaceCharactersBytes | frozenset([b"/"]))
        assert c is None or len(c) == 1
        # Step 2
-        if c in (">", None):
+        if c in (b">", None):
            return None
        # Step 3
        attrName = []
        attrValue = []
        # Step 4 attribute name
        while True:
-            if c == "=" and attrName:   
+            if c == b"=" and attrName:
                break
            elif c in spaceCharactersBytes:
                # Step 6!
                c = data.skip()
                c = data.next()
                break
-            elif c in ("/", ">"):
+            elif c in (b"/", b">"):
-                return "".join(attrName), ""
+                return b"".join(attrName), b""
            elif c in asciiUppercaseBytes:
                attrName.append(c.lower())
-            elif c == None:
+            elif c is None:
                return None
            else:
                attrName.append(c)
            # Step 5
-            c = data.next()
+            c = next(data)
        # Step 7
-        if c != "=":
+        if c != b"=":
            data.previous()
-            return "".join(attrName), ""
+            return b"".join(attrName), b""
        # Step 8
-        data.next()
+        next(data)
        # Step 9
        c = data.skip()
        # Step 10
-        if c in ("'", '"'):
+        if c in (b"'", b'"'):
            # 10.1
            quoteChar = c
            while True:
                # 10.2
-                c = data.next()
+                c = next(data)
                # 10.3
                if c == quoteChar:
-                    data.next()
+                    next(data)
-                    return "".join(attrName), "".join(attrValue)
+                    return b"".join(attrName), b"".join(attrValue)
                # 10.4
                elif c in asciiUppercaseBytes:
                    attrValue.append(c.lower())
                # 10.5
                else:
                    attrValue.append(c)
-        elif c == ">":
+        elif c == b">":
-            return "".join(attrName), ""
+            return b"".join(attrName), b""
        elif c in asciiUppercaseBytes:
            attrValue.append(c.lower())
        elif c is None:
@ -724,9 +821,9 @@ class EncodingParser(object):
            attrValue.append(c)
        # Step 11
        while True:
-            c = data.next()
+            c = next(data)
            if c in spacesAngleBrackets:
-                return "".join(attrName), "".join(attrValue)
+                return b"".join(attrName), b"".join(attrValue)
            elif c in asciiUppercaseBytes:
                attrValue.append(c.lower())
            elif c is None:
@ -737,21 +834,23 @@ class EncodingParser(object):
 class ContentAttrParser(object):
    def __init__(self, data):
        assert isinstance(data, bytes)
        self.data = data
    def parse(self):
        try:
            # Check if the attr name is charset
            # otherwise return
-            self.data.jumpTo("charset")
+            self.data.jumpTo(b"charset")
            self.data.position += 1
            self.data.skip()
-            if not self.data.currentByte == "=":
+            if not self.data.currentByte == b"=":
                # If there is no = sign keep looking for attrs
                return None
            self.data.position += 1
            self.data.skip()
            # Look for an encoding between matching quote marks
-            if self.data.currentByte in ('"', "'"):
+            if self.data.currentByte in (b'"', b"'"):
                quoteMark = self.data.currentByte
                self.data.position += 1
                oldPosition = self.data.position
@ -775,7 +874,12 @@ class ContentAttrParser(object):
 def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding."""
-    if (encoding is not None and type(encoding) in types.StringTypes):
+    if isinstance(encoding, bytes):
        try:
            encoding = encoding.decode("ascii")
        except UnicodeDecodeError:
            return None
    if encoding:
        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
        return encodings.get(canonicalName, None)
    else:
--- a/lib/html5lib/sanitizer.py
+++ b/lib/html5lib/sanitizer.py
@ -1,8 +1,11 @@
 from __future__ import absolute_import, division, unicode_literals
 import re
 from xml.sax.saxutils import escape, unescape
-from tokenizer import HTMLTokenizer
+from .tokenizer import HTMLTokenizer
-from constants import tokenTypes
+from .constants import tokenTypes
 class HTMLSanitizerMixin(object):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
@ -49,8 +52,8 @@ class HTMLSanitizerMixin(object):
                             'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
                             'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
                             'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
-        'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
+                             'optimum', 'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
-        'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
+                             'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
                             'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
                             'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
                             'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
@ -97,7 +100,7 @@ class HTMLSanitizerMixin(object):
                      'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
                      'y1', 'y2', 'zoomAndPan']
-    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
+    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', 'poster',
                       'xlink:href', 'xml:base']
    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
@ -160,23 +163,32 @@ class HTMLSanitizerMixin(object):
        # accommodate filters which use token_type differently
        token_type = token["type"]
-        if token_type in tokenTypes.keys():
+        if token_type in list(tokenTypes.keys()):
            token_type = tokenTypes[token_type]
        if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
                          tokenTypes["EmptyTag"]):
            if token["name"] in self.allowed_elements:
-                if token.has_key("data"):
+                return self.allowed_token(token, token_type)
            else:
                return self.disallowed_token(token, token_type)
        elif token_type == tokenTypes["Comment"]:
            pass
        else:
            return token
    def allowed_token(self, token, token_type):
        if "data" in token:
            attrs = dict([(name, val) for name, val in
                          token["data"][::-1]
                          if name in self.allowed_attributes])
            for attr in self.attr_val_is_uri:
-                        if not attrs.has_key(attr):
+                if attr not in attrs:
                    continue
                val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
                                       unescape(attrs[attr])).lower()
                # remove replacement characters from unescaped characters
-                        val_unescaped = val_unescaped.replace(u"\ufffd", "")
+                val_unescaped = val_unescaped.replace("\ufffd", "")
                if (re.match("^[a-z0-9][-+.a-z0-9]*:", val_unescaped) and
                    (val_unescaped.split(':')[0] not in
                     self.allowed_protocols)):
@ -190,11 +202,12 @@ class HTMLSanitizerMixin(object):
                'xlink:href' in attrs and re.search('^\s*[^#\s].*',
                                                    attrs['xlink:href'])):
                del attrs['xlink:href']
-                    if attrs.has_key('style'):
+            if 'style' in attrs:
                attrs['style'] = self.sanitize_css(attrs['style'])
-                    token["data"] = [[name,val] for name,val in attrs.items()]
+            token["data"] = [[name, val] for name, val in list(attrs.items())]
        return token
-            else:
+
    def disallowed_token(self, token, token_type):
        if token_type == tokenTypes["EndTag"]:
            token["data"] = "</%s>" % token["name"]
        elif token["data"]:
@ -205,29 +218,28 @@ class HTMLSanitizerMixin(object):
        if token.get("selfClosing"):
            token["data"] = token["data"][:-1] + "/>"
-                if token["type"] in tokenTypes.keys():
+        if token["type"] in list(tokenTypes.keys()):
            token["type"] = "Characters"
        else:
            token["type"] = tokenTypes["Characters"]
        del token["name"]
        return token
        elif token_type == tokenTypes["Comment"]:
            pass
        else:
            return token
    def sanitize_css(self, style):
        # disallow urls
        style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style)
        # gauntlet
-        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
+        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
-        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
+            return ''
        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
            return ''
        clean = []
        for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style):
-          if not value: continue
+            if not value:
                continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
@ -243,6 +255,7 @@ class HTMLSanitizerMixin(object):
        return ' '.join(clean)
 class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                 lowercaseElementName=False, lowercaseAttrName=False, parser=None):
--- a/lib/html5lib/serializer/init.py
+++ b/lib/html5lib/serializer/init.py
@ -1,17 +1,16 @@
 from __future__ import absolute_import, division, unicode_literals
-from html5lib import treewalkers
+from .. import treewalkers
-from htmlserializer import HTMLSerializer
+from .htmlserializer import HTMLSerializer
 from xhtmlserializer import XHTMLSerializer
-def serialize(input, tree="simpletree", format="html", encoding=None,
+
 def serialize(input, tree="etree", format="html", encoding=None,
              **serializer_opts):
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    if format == "html":
        s = HTMLSerializer(**serializer_opts)
    elif format == "xhtml":
        s = XHTMLSerializer(**serializer_opts)
    else:
-        raise ValueError, "type must be either html or xhtml"
+        raise ValueError("type must be html")
    return s.render(walker(input), encoding)
--- a/lib/html5lib/serializer/htmlserializer.py
+++ b/lib/html5lib/serializer/htmlserializer.py
@ -1,18 +1,20 @@
-try:
+from __future__ import absolute_import, division, unicode_literals
-    frozenset
+from six import text_type
 except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
 import gettext
 _ = gettext.gettext
-from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
+try:
-from html5lib.constants import rcdataElements, entities, xmlEntities
+    from functools import reduce
-from html5lib import utils
+except ImportError:
    pass
 from ..constants import voidElements, booleanAttributes, spaceCharacters
 from ..constants import rcdataElements, entities, xmlEntities
 from .. import utils
 from xml.sax.saxutils import escape
-spaceCharacters = u"".join(spaceCharacters)
+spaceCharacters = "".join(spaceCharacters)
 try:
    from codecs import register_error, xmlcharrefreplace_errors
@ -21,11 +23,9 @@ except ImportError:
 else:
    unicode_encode_errors = "htmlentityreplace"
    from html5lib.constants import entities
    encode_entity_map = {}
-    is_ucs4 = len(u"\U0010FFFF") == 1
+    is_ucs4 = len("\U0010FFFF") == 1
-    for k, v in entities.items():
+    for k, v in list(entities.items()):
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
@ -34,11 +34,7 @@ else:
            if len(v) == 2:
                v = utils.surrogatePairToCodepoint(v)
            else:
                try:
                v = ord(v)
                except:
                    print v
                    raise
            if not v in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k
@ -68,7 +64,7 @@ else:
                        res.append(";")
                else:
                    res.append("&#x%s;" % (hex(cp)[2:]))
-            return (u"".join(res), exc.end)
+            return ("".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)
@ -81,7 +77,7 @@ class HTMLSerializer(object):
    # attribute quoting options
    quote_attr_values = False
-    quote_char = u'"'
+    quote_char = '"'
    use_best_quote_char = True
    # tag syntax options
@ -96,15 +92,17 @@ class HTMLSerializer(object):
    resolve_entities = True
    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
-          "minimize_boolean_attributes", "use_trailing_solidus",
+               "omit_optional_tags", "minimize_boolean_attributes",
-          "space_before_trailing_solidus", "omit_optional_tags",
+               "use_trailing_solidus", "space_before_trailing_solidus",
-          "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
+               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
-          "escape_rcdata", "resolve_entities", "sanitize")
+               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")
    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.
@ -147,10 +145,12 @@ class HTMLSerializer(object):
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.
        alphabetical_attributes=False|True
          Reorder attributes to be in alphabetical order.
        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
-        if kwargs.has_key('quote_char'):
+        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
@ -158,14 +158,14 @@ class HTMLSerializer(object):
        self.strict = False
    def encode(self, string):
-        assert(isinstance(string, unicode))
+        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string
    def encodeStrict(self, string):
-        assert(isinstance(string, unicode))
+        assert(isinstance(string, text_type))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
@ -175,39 +175,46 @@ class HTMLSerializer(object):
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
-            from html5lib.filters.inject_meta_charset import Filter
+            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
-        # XXX: WhitespaceFilter should be used before OptionalTagFilter
+        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
-            from html5lib.filters.whitespace import Filter
+            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
-            from html5lib.filters.sanitizer import Filter
+            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
-            from html5lib.filters.optionaltags import Filter
+            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        # Alphabetical attributes must be last, as other filters
        # could add attributes and alter the order
        if self.alphabetical_attributes:
            from ..filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
-                doctype = u"<!DOCTYPE %s" % token["name"]
+                doctype = "<!DOCTYPE %s" % token["name"]
                if token["publicId"]:
-                    doctype += u' PUBLIC "%s"' % token["publicId"]
+                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
-                    doctype += u" SYSTEM"
+                    doctype += " SYSTEM"
                if token["systemId"]:
-                    if token["systemId"].find(u'"') >= 0:
+                    if token["systemId"].find('"') >= 0:
-                        if token["systemId"].find(u"'") >= 0:
+                        if token["systemId"].find("'") >= 0:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
-                        quote_char = u"'"
+                        quote_char = "'"
                    else:
-                        quote_char = u'"'
+                        quote_char = '"'
-                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
+                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
-                doctype += u">"
+                doctype += ">"
                yield self.encodeStrict(doctype)
            elif type in ("Characters", "SpaceCharacters"):
@ -220,41 +227,41 @@ class HTMLSerializer(object):
            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
-                yield self.encodeStrict(u"<%s" % name)
+                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                attributes = []
+                for (attr_namespace, attr_name), attr_value in token["data"].items():
                for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
-                    yield self.encodeStrict(u' ')
+                    yield self.encodeStrict(' ')
                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
-                      (k not in booleanAttributes.get(name, tuple()) \
+                        (k not in booleanAttributes.get(name, tuple())
                         and k not in booleanAttributes.get("", tuple())):
-                        yield self.encodeStrict(u"=")
+                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x, y: x or (y in v),
-                                spaceCharacters + u">\"'=", False)
+                                                spaceCharacters + ">\"'=", False)
-                        v = v.replace(u"&", u"&amp;")
+                        v = v.replace("&", "&amp;")
-                        if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
+                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
-                                if u"'" in v and u'"' not in v:
+                                if "'" in v and '"' not in v:
-                                    quote_char = u'"'
+                                    quote_char = '"'
-                                elif u'"' in v and u"'" not in v:
+                                elif '"' in v and "'" not in v:
-                                    quote_char = u"'"
+                                    quote_char = "'"
-                            if quote_char == u"'":
+                            if quote_char == "'":
-                                v = v.replace(u"'", u"&#39;")
+                                v = v.replace("'", "&#39;")
                            else:
-                                v = v.replace(u'"', u"&quot;")
+                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
@ -262,10 +269,10 @@ class HTMLSerializer(object):
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
-                        yield self.encodeStrict(u" /")
+                        yield self.encodeStrict(" /")
                    else:
-                        yield self.encodeStrict(u"/")
+                        yield self.encodeStrict("/")
-                yield self.encode(u">")
+                yield self.encode(">")
            elif type == "EndTag":
                name = token["name"]
@ -273,13 +280,13 @@ class HTMLSerializer(object):
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
-                yield self.encodeStrict(u"</%s>" % name)
+                yield self.encodeStrict("</%s>" % name)
            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
-                yield self.encodeStrict(u"<!--%s-->" % token["data"])
+                yield self.encodeStrict("<!--%s-->" % token["data"])
            elif type == "Entity":
                name = token["name"]
@ -289,7 +296,7 @@ class HTMLSerializer(object):
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
-                    data = u"&%s;" % name
+                    data = "&%s;" % name
                yield self.encodeStrict(data)
            else:
@ -297,9 +304,9 @@ class HTMLSerializer(object):
    def render(self, treewalker, encoding=None):
        if encoding:
-            return "".join(list(self.serialize(treewalker, encoding)))
+            return b"".join(list(self.serialize(treewalker, encoding)))
        else:
-            return u"".join(list(self.serialize(treewalker)))
+            return "".join(list(self.serialize(treewalker)))
    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
@ -307,6 +314,7 @@ class HTMLSerializer(object):
        if self.strict:
            raise SerializeError
 class SerializeError(Exception):
    """Error in serialized tree.

    Raised by ``HTMLSerializer.serializeError`` when the serializer runs in
    strict mode.  This has to be a real exception class: the former ``def``
    created a plain function, so ``raise SerializeError`` failed with a
    ``TypeError`` instead of raising this error.
    """
    pass
--- a/lib/html5lib/serializer/xhtmlserializer.py
+++ b/lib/html5lib/serializer/xhtmlserializer.py
@ -1,9 +0,0 @@
 from htmlserializer import HTMLSerializer
 class XHTMLSerializer(HTMLSerializer):
    """HTMLSerializer with its option defaults switched to XHTML-style output.

    Only class-level option defaults are overridden; all serialization logic
    is inherited from HTMLSerializer.
    """
    # quote every attribute value
    quote_attr_values = True
    # always write the "=value" part, also for boolean attributes
    minimize_boolean_attributes = False
    # write a "/" before the ">" of void elements
    use_trailing_solidus = True
    # replace "<" in attribute values with "&lt;"
    escape_lt_in_attrs = True
    # keep optional tags (the optional-tags filter is not applied)
    omit_optional_tags = False
    # NOTE(review): presumably makes rcdata element content get escaped like
    # ordinary text -- confirm against HTMLSerializer.serialize
    escape_rcdata = True
--- a/lib/html5lib/tests/init.py
+++ b/lib/html5lib/tests/init.py
@ -1,12 +0,0 @@
 import sys
 import os
 # Make the parent directory of this package importable by prepending it to
 # sys.path, unless it is already listed there.
 parent_path = os.path.abspath(os.path.join(os.path.split(__file__)[0], ".."))
 if not parent_path in sys.path:
    sys.path.insert(0, parent_path)
 # The name was only needed for the path setup; do not leave it in the namespace
 del parent_path
 from runtests import buildTestSuite
 import support
--- a/lib/html5lib/tests/mockParser.py
+++ b/lib/html5lib/tests/mockParser.py
@ -1,37 +0,0 @@
 import sys
 import os
 if __name__ == '__main__':
    #Allow us to import from the src directory
    # Only when run as a script: switch to this file's directory first, so the
    # relative "../src" path below is resolved from here
    os.chdir(os.path.split(os.path.abspath(__file__))[0])
    sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, "src")))
 from tokenizer import HTMLTokenizer
 class HTMLParser(object):
    """ Fake parser to test tokenizer output """
    def parse(self, stream, output=True):
        """Run HTMLTokenizer over *stream*, consuming every token.

        stream -- passed unchanged to HTMLTokenizer
        output -- when true (the default) each token is printed; when false
                  the tokens are only consumed

        Returns None.
        """
        tokenizer = HTMLTokenizer(stream)
        for token in tokenizer:
            if output:
                # print() with a single argument behaves identically on
                # Python 2 and Python 3, unlike the print statement
                print(token)
 if __name__ == "__main__":
    # Command-line entry point: mockParser.py filename [stats]
    x = HTMLParser()
    if len(sys.argv) > 1:
        if len(sys.argv) > 2:
            # Any second argument selects profiling mode: the parse runs under
            # the hotshot profiler with token printing switched off
            # (output=False) and the timings are written to 'stats.prof'
            import hotshot, hotshot.stats
            prof = hotshot.Profile('stats.prof')
            prof.runcall(x.parse, sys.argv[1], False)
            prof.close()
            # Reload the recorded profile and print it sorted by time
            stats = hotshot.stats.load('stats.prof')
            stats.strip_dirs()
            stats.sort_stats('time')
            stats.print_stats()
        else:
            # Plain mode: tokenize the first argument and print every token
            x.parse(sys.argv[1])
    else:
        # No arguments given: show the usage text
        print """Usage: python mockParser.py filename [stats]
        If stats is specified the hotshots profiler will run and output the
        stats instead.
        """
--- a/lib/html5lib/tests/runparsertests.py
+++ b/lib/html5lib/tests/runparsertests.py
@ -1,27 +0,0 @@
 import sys
 import os
 import glob
 import unittest
 #Allow us to import the parent module
 # Work from this file's directory (buildTestSuite() globs 'test_*.py'
 # relative to the current directory) and make this directory, its parent and
 # the parent's "src" directory importable.  Every insert goes to position 0,
 # so the path inserted last ("src") is searched first.
 os.chdir(os.path.split(os.path.abspath(__file__))[0])
 sys.path.insert(0, os.path.abspath(os.curdir))
 sys.path.insert(0, os.path.abspath(os.pardir))
 sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 def buildTestSuite():
    """Collect the tokenizer and parser test modules into a single suite.

    Only the whitelisted test_*.py files found in the current directory are
    imported; each must expose its own buildTestSuite().
    """
    wanted = ("test_tokenizer.py", "test_parser.py", "test_parser2.py")
    collected = unittest.TestSuite()
    for filename in glob.glob('test_*.py'):
        if filename not in wanted:
            continue
        module_name = os.path.splitext(filename)[0]
        collected.addTest(__import__(module_name).buildTestSuite())
    return collected
 def main():
    """Run the suite from buildTestSuite() with a text runner and return its result."""
    runner = unittest.TextTestRunner()
    return runner.run(buildTestSuite())
 if __name__ == "__main__":
    # Run the suite; a failed run is reported to the caller as exit status 1
    results = main()
    if not results.wasSuccessful():
        sys.exit(1)
--- a/lib/html5lib/tests/runtests.py
+++ b/lib/html5lib/tests/runtests.py
@ -1,20 +0,0 @@
 import sys
 import os
 import glob
 import unittest
 def buildTestSuite():
    """Combine the suites of every test_*.py module in the current directory.

    Each matching file is imported by module name and must expose its own
    buildTestSuite().
    """
    combined = unittest.TestSuite()
    for path in glob.glob('test_*.py'):
        module_name, _extension = os.path.splitext(path)
        test_module = __import__(module_name)
        combined.addTest(test_module.buildTestSuite())
    return combined
 def main():
    """Run the suite from buildTestSuite() with a text runner and return its result."""
    runner = unittest.TextTestRunner()
    return runner.run(buildTestSuite())
 if __name__ == "__main__":
    # Run the suite; a failed run is reported to the caller as exit status 1
    results = main()
    if not results.wasSuccessful():
        sys.exit(1)
--- a/lib/html5lib/tests/support.py
+++ b/lib/html5lib/tests/support.py
@ -1,127 +0,0 @@
 import os
 import sys
 import codecs
 import glob
 # Directory containing this module; the test data is located relative to it
 base_path = os.path.split(__file__)[0]
 if os.path.exists(os.path.join(base_path, 'testdata')):
    #release
    # a 'testdata' directory sits right next to this module
    test_dir = os.path.join(base_path, 'testdata')
 else:
    #development
    # otherwise 'testdata' is expected three directory levels above this module
    test_dir = os.path.abspath(
        os.path.join(base_path,
                     os.path.pardir, os.path.pardir,
                     os.path.pardir, 'testdata'))
    assert os.path.exists(test_dir), "Test data not found"
    #import the development html5lib
    # (the directory two levels up is put first on sys.path for that purpose)
    sys.path.insert(0, os.path.abspath(os.path.join(base_path, 
                                                    os.path.pardir,
                                                    os.path.pardir)))
 import html5lib
 from html5lib import html5parser, treebuilders
 del base_path
 #Build a dict of available trees
 # Maps a display name to the tree builder returned by getTreeBuilder()
 treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
             "DOM":treebuilders.getTreeBuilder("dom")}
 #Try whatever etree implementations are available from a list that are
 #"supposed" to work
 # Every optional implementation below is registered only when its import
 # succeeds; a missing module is silently skipped.
 try:
    import xml.etree.ElementTree as ElementTree
    treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
 except ImportError:
    # fall back to the separately packaged module of the same name
    try:
        import elementtree.ElementTree as ElementTree
        treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
    except ImportError:
        pass
 try:
    import xml.etree.cElementTree as cElementTree
    treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
 except ImportError:
    # fall back to the separately packaged module of the same name
    try:
        import cElementTree
        treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
    except ImportError:
        pass
 try:
    import lxml.etree as lxml
    treeTypes['lxml'] = treebuilders.getTreeBuilder("etree", lxml, fullTree=True)
 except ImportError:
    pass
 try:
    # imported only to find out whether BeautifulSoup is installed
    import BeautifulSoup
    treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder("beautifulsoup", fullTree=True)
 except ImportError:
    pass
 def html5lib_test_files(subdirectory, files='*.dat'):
    """Return the paths inside test_dir/subdirectory that match the glob pattern *files*."""
    pattern = os.path.join(test_dir, subdirectory, files)
    return glob.glob(pattern)
 class DefaultDict(dict):
    """dict whose item lookup falls back to a fixed default value.

    Looking up a missing key returns ``default`` instead of raising KeyError;
    the lookup does not insert the key.
    """
    def __init__(self, default, *args, **kwargs):
        super(DefaultDict, self).__init__(*args, **kwargs)
        self.default = default
    def __getitem__(self, key):
        fallback = self.default
        return super(DefaultDict, self).get(key, fallback)
 class TestData(object):
    """Iterable over the test cases stored in a sectioned test data file.

    A line starting with "#" opens a new section named by the rest of the
    line; the lines that follow form that section's text.  A section whose
    name equals ``newTestHeading`` begins a new test case.  Every test case
    is yielded as a DefaultDict mapping section name to section text, so a
    section that is absent from a test case reads as None.
    """
    def __init__(self, filename, newTestHeading="data"):
        # The file is decoded as UTF-8 and stays open on self.f; iterating
        # over this object reads it
        self.f = codecs.open(filename, encoding="utf8")
        self.newTestHeading = newTestHeading
    def __iter__(self):
        """Yield one normalised section dict per test case in the file."""
        data = DefaultDict(None)
        key=None
        for line in self.f:
            heading = self.isSectionHeading(line)
            if heading:
                if data and heading == self.newTestHeading:
                    # A new test case starts: finish and emit the previous one
                    #Remove trailing newline
                    data[key] = data[key][:-1]
                    yield self.normaliseOutput(data)
                    data = DefaultDict(None)
                key = heading
                data[key]=""
            elif key is not None:
                # Lines before the first heading are ignored
                data[key] += line
        if data:
            # Emit the last test case, which no further heading terminates
            yield self.normaliseOutput(data)
    def isSectionHeading(self, line):
        """If the current heading is a test section heading return the heading,
        otherwise return False"""
        if line.startswith("#"):
            # A bare "#" gives an empty string, which __iter__ treats like a
            # non-heading line
            return line[1:].strip()
        else:
            return False
    def normaliseOutput(self, data):
        """Strip one trailing newline from every section; *data* is changed
        in place and returned."""
        #Remove trailing newlines
        # items() exists on Python 2 and 3 (iteritems() is Python 2 only);
        # the list() snapshot keeps the loop independent of the updates below
        for key, value in list(data.items()):
            if value.endswith("\n"):
                data[key] = value[:-1]
        return data
 def convert(stripChars):
    """Build a converter that strips a prefix from "|"-prefixed lines.

    The returned function drops the first ``stripChars`` characters of every
    line that starts with "|" and leaves all other lines untouched.
    """
    def convertData(data):
        """convert the output of str(document) to the format used in the testcases"""
        return "\n".join(
            line[stripChars:] if line.startswith("|") else line
            for line in data.split("\n"))
    return convertData
 # Converter that drops the first two characters of every "|"-prefixed line
 convertExpected = convert(2)
--- a/lib/html5lib/tests/test_encoding.py
+++ b/lib/html5lib/tests/test_encoding.py
@ -1,54 +0,0 @@
 import os
 import unittest
 from support import html5lib_test_files, TestData, test_dir
 from html5lib import HTMLParser, inputstream
 import re, unittest
 class Html5EncodingTestCase(unittest.TestCase):
    """Encoding tests for html5lib's inputstream module.

    Only the codec-name test is written out here; buildTestSuite() attaches
    further test_* methods to this class at runtime.
    """
    def test_codec_name(self):
        # codecName() is expected to normalise spelling variants of a label
        # (missing hyphen, surrounding whitespace) and to map the ISO-8859-1
        # label to windows-1252.
        # assertEqual is used because the assertEquals alias is deprecated
        # and was removed in Python 3.12.
        self.assertEqual(inputstream.codecName("utf-8"), "utf-8")
        self.assertEqual(inputstream.codecName("utf8"), "utf-8")
        self.assertEqual(inputstream.codecName("  utf8  "), "utf-8")
        self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 def buildTestSuite():
    """Attach the data-driven encoding tests to Html5EncodingTestCase and load them.

    One test method is generated per test case of every data file returned by
    html5lib_test_files("encoding").  When chardet can be imported, a test
    for chardet-based detection is added as well.

    Returns the tests loaded from this module by the default unittest loader.
    """
    for filename in html5lib_test_files("encoding"):
        test_name = os.path.basename(filename).replace('.dat',''). \
            replace('-','')
        tests = TestData(filename, "data")
        for idx, test in enumerate(tests):
            # data and encoding are bound as default arguments so that every
            # generated method keeps its own test case instead of the last one
            def encodingTest(self, data=test['data'],
                             encoding=test['encoding']):
                p = HTMLParser()
                # parse() is called for its effect on the parser's stream;
                # its return value is not needed
                p.parse(data, useChardet=False)
                errorMessage = ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
                                (data, repr(encoding.lower()),
                                 repr(p.tokenizer.stream.charEncoding)))
                # assertEqual: the assertEquals alias is deprecated and was
                # removed in Python 3.12
                self.assertEqual(encoding.lower(),
                                 p.tokenizer.stream.charEncoding[0],
                                 errorMessage)
            setattr(Html5EncodingTestCase, 'test_%s_%d' % (test_name, idx+1),
                encodingTest)
    try:
        # imported only to find out whether chardet is installed
        import chardet
        def test_chardet(self):
            # Read the sample as raw bytes, so that encoding detection sees
            # the undecoded data, and close the file deterministically
            with open(os.path.join(test_dir, "encoding" , "chardet", "test_big5.txt"), "rb") as f:
                data = f.read()
            encoding = inputstream.HTMLInputStream(data).charEncoding
            assert encoding[0].lower() == "big5"
        setattr(Html5EncodingTestCase, 'test_chardet', test_chardet)
    except ImportError:
        # print() with a single argument behaves identically on Python 2 and 3
        print("chardet not found, skipping chardet tests")
    return unittest.defaultTestLoader.loadTestsFromName(__name__)
 def main():
    """Generate the data-driven tests, then hand control to unittest.main()."""
    # buildTestSuite() is called for its side effect of attaching the generated
    # test methods to Html5EncodingTestCase; its return value is not used here
    buildTestSuite()
    unittest.main()
 if __name__ == "__main__":
    # Script entry point
    main()
--- a/lib/html5lib/tests/test_formfiller.py
+++ b/lib/html5lib/tests/test_formfiller.py
@ -1,296 +0,0 @@
 import sys
 import unittest
 from html5lib.filters.formfiller import SimpleFilter
 class FieldStorage(dict):
    """dict of form data that offers the getlist() accessor used by the tests."""
    def getlist(self, name):
        """Return the value stored under *name* as a list.

        A list is returned as is (the same object), a tuple or other iterable
        is copied into a new list, and any other value -- including text and
        byte strings -- is wrapped in a one-element list.

        Raises KeyError when *name* is not present.
        """
        value = self[name]
        if isinstance(value, list):
            return value
        # Strings are single values.  On Python 3 they have __iter__, so
        # without this check "bar" would be split into ['b', 'a', 'r'];
        # on Python 2 they never reached the iterable branch either.
        if isinstance(value, (bytes, type(u""))):
            return [value]
        if isinstance(value, tuple) or hasattr(value, '__iter__'):
            return list(value)
        return [value]
 class TestCase(unittest.TestCase):
    def runTest(self, input, formdata, expected):
        try:
            output = list(SimpleFilter(input, formdata))
        except NotImplementedError, nie:
            # Amnesty for those that confess...
            print >>sys.stderr, "Not implemented:", str(nie)
        else:
            errorMsg = "\n".join(["\n\nInput:", str(input),
                                  "\nForm data:", str(formdata),
                                  "\nExpected:", str(expected),
                                  "\nReceived:", str(output)])
            self.assertEquals(output, expected, errorMsg)
    def testSingleTextInputWithValue(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"quux")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"bar")]}])
    def testSingleTextInputWithoutValue(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"text"), (u"name", u"foo")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"text"), (u"name", u"foo"), (u"value", u"bar")]}])
    def testSingleCheckbox(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}])
    def testSingleCheckboxShouldBeUnchecked(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}])
    def testSingleCheckboxCheckedByDefault(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"bar"), (u"checked", u"")]}])
    def testSingleCheckboxCheckedByDefaultShouldBeUnchecked(self):
        self.runTest(
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux"), (u"checked", u"")]}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"EmptyTag", "name": u"input",
                "data": [(u"type", u"checkbox"), (u"name", u"foo"), (u"value", u"quux")]}])
    def testSingleTextareaWithValue(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
             {"type": u"Characters", "data": u"quux"},
             {"type": u"EndTag", "name": u"textarea", "data": []}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"textarea", "data": []}])
    def testSingleTextareaWithoutValue(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
             {"type": u"EndTag", "name": u"textarea", "data": []}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"StartTag", "name": u"textarea", "data": [(u"name", u"foo")]},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"textarea", "data": []}])
    def testSingleSelectWithValue(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
             {"type": u"Characters", "data": u"quux"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
             {"type": u"Characters", "data": u"quux"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}])
    def testSingleSelectWithValueShouldBeUnselected(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
             {"type": u"Characters", "data": u"quux"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}],
            FieldStorage({"foo": "quux"}),
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
             {"type": u"Characters", "data": u"quux"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}])
    def testSingleSelectWithoutValue(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": []},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}],
            FieldStorage({"foo": "bar"}),
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": [(u"selected", u"")]},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}])
    def testSingleSelectWithoutValueShouldBeUnselected(self):
        self.runTest(
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": []},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}],
            FieldStorage({"foo": "quux"}),
            [{"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
             {"type": u"StartTag", "name": u"option", "data": []},
             {"type": u"Characters", "data": u"bar"},
             {"type": u"EndTag", "name": u"option", "data": []},
             {"type": u"EndTag", "name": u"select", "data": []}])
    def testSingleSelectTwoOptionsWithValue(self):
        # Two options with explicit values "bar" and "quux"; the submitted
        # value for "foo" is "bar".
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": "bar"})
        # Only the option whose value attribute is "bar" gains "selected".
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
    def testSingleSelectTwoOptionsWithValueShouldBeUnselected(self):
        # Option values are "bar" and "baz" (both with text "quux"); the
        # submitted value for "foo" is "quux", which matches neither value.
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"baz")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": "quux"})
        # The expected stream is identical to the input: nothing is selected.
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"baz")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
    def testSingleSelectTwoOptionsWithoutValue(self):
        # Two value-less options with text "bar" and "quux"; the submitted
        # value for "foo" is "bar".
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"bar"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": "bar"})
        # The option whose text is "bar" gains "selected"; the other does not.
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"selected", u"")]},
            {"type": u"Characters", "data": u"bar"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
    def testSingleSelectTwoOptionsWithoutValueShouldBeUnselected(self):
        # Two value-less options with text "bar" and "baz"; the submitted
        # value for "foo" is "quux", which matches neither text.
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"bar"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"baz"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": "quux"})
        # The expected stream is identical to the input: nothing is selected.
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"bar"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": []},
            {"type": u"Characters", "data": u"baz"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
    def testSingleSelectMultiple(self):
        # A <select multiple> with option values "bar" and "quux"; the form
        # data carries both values for "foo".
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo"), (u"multiple", u"")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": ["bar", "quux"]})
        # Both options gain "selected".
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo"), (u"multiple", u"")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux"), (u"selected", u"")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
    def testTwoSelect(self):
        # Two consecutive single-choice selects share the name "foo"; the form
        # data carries two values for it, ["bar", "quux"].
        tokens = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        formdata = FieldStorage({"foo": ["bar", "quux"]})
        # The first select has "bar" selected and the second has "quux".
        expected = [
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar"), (u"selected", u"")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
            {"type": u"StartTag", "name": u"select", "data": [(u"name", u"foo")]},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"bar")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"StartTag", "name": u"option", "data": [(u"value", u"quux"), (u"selected", u"")]},
            {"type": u"Characters", "data": u"quux"},
            {"type": u"EndTag", "name": u"option", "data": []},
            {"type": u"EndTag", "name": u"select", "data": []},
        ]
        self.runTest(tokens, formdata, expected)
 def buildTestSuite():
    """Return a suite of every test the default loader finds in this module."""
    loader = unittest.defaultTestLoader
    suite = loader.loadTestsFromName(__name__)
    return suite
 def main():
    """Script entry point: load this module's tests, then run unittest."""
    # The suite returned here is discarded; unittest.main() does its own
    # discovery and runs the tests.
    buildTestSuite()
    unittest.main()
 # Only run the tests when this file is executed directly.
 if __name__ == "__main__":
    main()
--- a/lib/html5lib/tests/test_parser.py
+++ b/lib/html5lib/tests/test_parser.py
@ -1,140 +0,0 @@
 import os
 import sys
 import traceback
 import StringIO
 import warnings
 import re
 warnings.simplefilter("error")
 from support import html5lib_test_files as data_files
 from support import TestData, convert, convertExpected
 import html5lib
 from html5lib import html5parser, treebuilders, constants
 # Maps a tree-builder label to the builder class used by test_parser.
 # "simpletree" and "DOM" are registered unconditionally; every other entry is
 # added only when its backing library can be imported.
 treeTypes = {"simpletree":treebuilders.getTreeBuilder("simpletree"),
              "DOM":treebuilders.getTreeBuilder("dom")}
 # Try whatever etree implementations are available from a list of those that
 # are "supposed" to work
 try:
    import xml.etree.ElementTree as ElementTree
    treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
 except ImportError:
    # Fall back to the standalone elementtree package.
    try:
        import elementtree.ElementTree as ElementTree
        treeTypes['ElementTree'] = treebuilders.getTreeBuilder("etree", ElementTree, fullTree=True)
    except ImportError:
        pass
 try:
    import xml.etree.cElementTree as cElementTree
    treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
 except ImportError:
    # Fall back to the standalone cElementTree package.
    try:
        import cElementTree
        treeTypes['cElementTree'] = treebuilders.getTreeBuilder("etree", cElementTree, fullTree=True)
    except ImportError:
        pass
 try:
    # Prefer lxml.html and fall back to lxml.etree; if neither imports, the
    # outer handler skips the lxml builder entirely.
    try:
        import lxml.html as lxml
    except ImportError:
        import lxml.etree as lxml
    treeTypes['lxml'] = treebuilders.getTreeBuilder("lxml", lxml, fullTree=True)
 except ImportError:
    pass
 try:
    # The import is only an availability probe; the module object itself is
    # not passed to getTreeBuilder.
    import BeautifulSoup
    treeTypes["beautifulsoup"] = treebuilders.getTreeBuilder("beautifulsoup", fullTree=True)
 except ImportError:
    pass
 # Try whatever dom implementations are available from a list of those that
 # are "supposed" to work
 try:
    import pxdom
    treeTypes["pxdom"] = treebuilders.getTreeBuilder("dom", pxdom)
 except ImportError:
    pass
 # Switch for the parse-error count check in runParserTest; while False the
 # number of reported errors is not compared with the expected number.
 checkParseErrors = False
 #XXX - There should just be one function here but for some reason the testcase
 #format differs from the treedump format by a single space character
 def convertTreeDump(data):
    """Run *data* through ``convert(3)`` and return it without its first line."""
    converted = convert(3)(data)
    remaining = converted.split("\n")[1:]
    return "\n".join(remaining)
 # Bound ``sub`` method of a multiline regex that matches optional leading
 # whitespace followed by "<name>" at the start of a line. runParserTest calls
 # it with r"\1<html \2>" to prefix element names in the expected dump.
 namespaceExpected = re.compile(r"^(\s*)<(\S+)>", re.M).sub
 def runParserTest(innerHTML, input, expected, errors, treeClass,
                   namespaceHTMLElements):
    """Parse *input* with one tree builder and compare the dump to *expected*.

    innerHTML -- container element name for fragment parsing; when falsy the
                 input is parsed as a whole document
    input -- markup to parse
    expected -- expected tree dump in the test-file format
    errors -- expected parse errors (only their count is checked, and only
              when ``checkParseErrors`` is true)
    treeClass -- tree builder class handed to HTMLParser
    namespaceHTMLElements -- forwarded to HTMLParser; when true the expected
                             dump is rewritten to carry the "html" prefix

    Returns silently if constructing the parser or parsing a whole document
    raises DataLossWarning. Any other failure while parsing is reported as an
    assertion failure whose message includes the input and the traceback.
    """
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(tree = treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return
    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not converted into test failures.
        errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                               u"\nTraceback:", traceback.format_exc()])
        assert False, errorMsg.encode("utf8")
    output = convertTreeDump(p.tree.testSerializer(document))
    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r"\1<html \2>", expected)
    errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                           u"\nReceived:", output])
    assert expected == output, errorMsg.encode("utf8")
    errStr = []
    for (line, col), errorcode, datavars in p.errors:
        # "%" binds tighter than the conditional expression, so the choice of
        # format arguments must be made before the message is formatted.
        # Without this, a non-dict datavars replaced the whole message with
        # the tuple instead of being interpolated into it.
        formatArgs = datavars if isinstance(datavars, dict) else (datavars,)
        errStr.append(u"Line: %i Col: %i %s" % (line, col,
                                                constants.E[errorcode] % formatArgs))
    errorMsg2 = u"\n".join([u"\n\nInput:", input,
                            u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
                            u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
 def test_parser():
    """Yield a runParserTest case for every tree-construction test and builder.

    Each yielded tuple is ``(runParserTest, innerHTML, input, expected,
    errors, treeCls, namespaceHTMLElements)``.
    """
    sys.stderr.write('Testing tree builders '+ " ".join(treeTypes.keys()) + "\n")
    files = data_files('tree-construction')
    for filename in files:
        tests = TestData(filename, "data")
        for test in tests:
            # The key tuple is parenthesised so the comprehension is valid
            # syntax on every Python version.
            input, errors, innerHTML, expected = [test[key] for key in
                                                  ('data', 'errors',
                                                   'document-fragment',
                                                   'document')]
            if errors:
                errors = errors.split("\n")
            for treeCls in treeTypes.values():
                for namespaceHTMLElements in (True, False):
                    yield (runParserTest, innerHTML, input, expected, errors, treeCls,
                           namespaceHTMLElements)
                    # NOTE(review): this break means only the first value
                    # (True) is ever exercised; kept to preserve behaviour.
                    break
--- a/lib/html5lib/tests/test_parser2.py
+++ b/lib/html5lib/tests/test_parser2.py
@ -1,39 +0,0 @@
 import support
 from html5lib import html5parser
 from html5lib.constants import namespaces
 from html5lib.treebuilders import dom
 import unittest
 # tests that aren't autogenerated from text files
 class MoreParserTests(unittest.TestCase):
  """Hand-written parser regression tests (not generated from data files)."""

  def test_assertDoctypeCloneable(self):
    # A DOM document containing only a doctype must survive a deep clone.
    parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
    doc = parser.parse('<!DOCTYPE HTML>')
    self.assertTrue(doc.cloneNode(True))

  def test_line_counter(self):
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    # No assertion: the test passes as long as parsing does not raise.
    parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
    parser.parse("<pre>\nx\n&gt;\n</pre>")

  def test_namespace_html_elements_0(self):
    # With namespacing enabled the root element carries the HTML namespace.
    parser = html5parser.HTMLParser(namespaceHTMLElements=True)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.childNodes[0].namespace == namespaces["html"])

  def test_namespace_html_elements_1(self):
    # With namespacing disabled the root element has no namespace at all.
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.childNodes[0].namespace is None)
 def buildTestSuite():
  """Return a suite of every test the default loader finds in this module."""
  loader = unittest.defaultTestLoader
  suite = loader.loadTestsFromName(__name__)
  return suite
 def main():
    """Script entry point: load this module's tests, then run unittest."""
    # The suite returned here is discarded; unittest.main() does its own
    # discovery and runs the tests.
    buildTestSuite()
    unittest.main()
 # Only run the tests when this file is executed directly.
 if __name__ == '__main__':
    main()
--- a/lib/html5lib/tests/test_sanitizer.py
+++ b/lib/html5lib/tests/test_sanitizer.py
@ -1,76 +0,0 @@
 import os
 import sys
 import unittest
 try:
    import json
 except ImportError:
    import simplejson as json
 from html5lib import html5parser, sanitizer, constants
 def runSanitizerTest(name, expected, input):
    """Assert that sanitizing *input* yields the parsed form of *expected*.

    *name* is only a label carried in the yielded test tuple; it is not used
    in the body.
    """
    fragment = html5parser.HTMLParser().parseFragment(expected)
    serialized = ''.join([node.toxml() for node in fragment.childNodes])
    # Round-trip through JSON before comparing (presumably to normalise the
    # string type -- confirm).
    expected = json.loads(json.dumps(serialized))
    assert expected == sanitize_html(input)
 def sanitize_html(stream):
    """Parse *stream* as a fragment through HTMLSanitizer and return its XML."""
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = parser.parseFragment(stream)
    pieces = [node.toxml() for node in fragment.childNodes]
    return ''.join(pieces)
 def test_should_handle_astral_plane_characters():
    """Numeric references above U+FFFF must come out as the real characters."""
    expected = u"<p>\U0001d4b5 \U0001d538</p>"
    markup = "<p>&#x1d4b5; &#x1d538;</p>"
    assert expected == sanitize_html(markup)
 def test_sanitizer():
    """Yield sanitizer test cases built from HTMLSanitizer's allow-lists.

    Every yielded tuple is ``(runSanitizerTest, name, expected, input)``: the
    third element is the expected markup and the fourth is the markup fed to
    the sanitizer.
    """
    # Allowed elements: the tag itself is kept while the unknown <bad> child
    # is expected to come out escaped.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
            continue ### TODO
        if tag_name != tag_name.lower():
            continue ### TODO
        if tag_name == 'image':
            # <image> input is expected to come out as <img>.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
              "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
              "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
        elif tag_name == 'br':
            # The stray </br> end tag is expected to yield a second <br/>.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
              "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
              "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
        elif tag_name in constants.voidElements:
            # Void elements are expected self-closed, with the text following.
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
              "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
              "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
        else:
            yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
              "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name,tag_name),
              "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
    # Upper-cased element names: the whole tag is expected to be escaped.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
          "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name,tag_name),
          "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name,tag_name))
    # Allowed attributes are expected to be preserved on <p>; 'style' is
    # skipped here.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower(): continue ### TODO
        if attribute_name == 'style': continue
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
          "<p %s=\"foo\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % attribute_name,
          "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name)
    # Upper-cased attribute names are expected to be dropped from the output.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
          "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
          "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name)
    # Allowed protocols: the href value is the bare protocol name.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
          "<a href=\"%s\">foo</a>" % protocol,
          """<a href="%s">foo</a>""" % protocol)
    # NOTE(review): the test name says "uppercase" but `protocol` is never
    # upper-cased, so this loop repeats the inputs of the previous one --
    # confirm whether protocol.upper() was intended.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
          "<a href=\"%s\">foo</a>" % protocol,
        """<a href="%s">foo</a>""" % protocol)
--- a/lib/html5lib/tests/test_serializer.py
+++ b/lib/html5lib/tests/test_serializer.py
@ -1,180 +0,0 @@
 import os
 import unittest
 from support import html5lib_test_files
 try:
    import json
 except ImportError:
    import simplejson as json
 import html5lib
 from html5lib import html5parser, serializer, constants
 from html5lib.treewalkers._base import TreeWalker
 # Names of optional third-party libraries that were importable; consulted
 # further down to decide whether the lxml-specific test case is defined.
 optionals_loaded = []
 try:
    from lxml import etree
    optionals_loaded.append("lxml")
 except ImportError:
    pass
 # Namespace JsonWalker assigns to tags whose token carries no namespace.
 default_namespace = constants.namespaces["html"]
 class JsonWalker(TreeWalker):
    """Tree walker over the list-based token format of the serializer tests.

    Each entry of ``self.tree`` is a list whose first item names the token
    type; the remaining items depend on that type. Tags may omit their
    namespace, in which case ``default_namespace`` is used.
    """

    def __iter__(self):
        for entry in self.tree:
            kind = entry[0]
            if kind == "StartTag":
                # Four items means an explicit namespace was supplied.
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:3]
                attributes = self._convertAttrib(attrib)
                yield self.startTag(namespace, name, attributes)
            elif kind == "EndTag":
                if len(entry) == 3:
                    namespace, name = entry[1:3]
                else:
                    namespace = default_namespace
                    name = entry[1]
                yield self.endTag(namespace, name)
            elif kind == "EmptyTag":
                if len(entry) == 4:
                    namespace, name, attrib = entry[1:]
                else:
                    namespace = default_namespace
                    name, attrib = entry[1:]
                attributes = self._convertAttrib(attrib)
                # emptyTag is iterated, so each item is re-yielded in turn.
                for event in self.emptyTag(namespace, name, attributes):
                    yield event
            elif kind == "Comment":
                yield self.comment(entry[1])
            elif kind in ("Characters", "SpaceCharacters"):
                for event in self.text(entry[1]):
                    yield event
            elif kind == "Doctype":
                # Public and system identifiers are optional trailing items.
                size = len(entry)
                if size == 4:
                    yield self.doctype(entry[1], entry[2], entry[3])
                elif size == 3:
                    yield self.doctype(entry[1], entry[2])
                else:
                    yield self.doctype(entry[1])
            else:
                raise ValueError("Unknown token type: " + kind)

    def _convertAttrib(self, attribs):
        """html5lib tree-walkers use a dict of (namespace, name): value for
        attributes, but JSON cannot represent this. Convert from the format
        in the serializer tests (a list of dicts with "namespace", "name",
        and "value" as keys) to html5lib's tree-walker format."""
        converted = {}
        for item in attribs:
            key = (item["namespace"], item["name"])
            # The same attribute must not appear twice.
            assert(key not in converted)
            converted[key] = item["value"]
        return converted
 def serialize_html(input, options):
    """Render the JSON token list *input* with HTMLSerializer.

    Option keys are converted with ``str`` so they can be passed as keyword
    arguments; the "encoding" option (None when absent) is also passed to
    ``render``.
    """
    options = dict((str(key), value) for key, value in options.iteritems())
    htmlSerializer = serializer.HTMLSerializer(**options)
    walker = JsonWalker(input)
    encoding = options.get("encoding", None)
    return htmlSerializer.render(walker, encoding)
 def serialize_xhtml(input, options):
    """Render the JSON token list *input* with XHTMLSerializer.

    Option keys are converted with ``str`` so they can be passed as keyword
    arguments; the "encoding" option (None when absent) is also passed to
    ``render``.
    """
    options = dict((str(key), value) for key, value in options.iteritems())
    xhtmlSerializer = serializer.XHTMLSerializer(**options)
    walker = JsonWalker(input)
    encoding = options.get("encoding", None)
    return xhtmlSerializer.render(walker, encoding)
 def make_test(input, expected, xhtml, options):
    """Check the HTML (and optionally XHTML) serialization of *input*.

    *expected* and *xhtml* are lists of acceptable outputs. A single-item list
    is compared for equality, otherwise the result must be one of the listed
    alternatives. The XHTML check is skipped when *xhtml* is falsy.
    """
    htmlResult = serialize_html(input, options)
    if len(expected) == 1:
        assert expected[0] == htmlResult, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], htmlResult, str(options))
    else:
        assert htmlResult in expected, "Expected: %s, Received: %s" % (expected, htmlResult)
    if xhtml:
        xhtmlResult = serialize_xhtml(input, options)
        if len(xhtml) == 1:
            assert xhtml[0] == xhtmlResult, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:True\n%s"%(xhtml[0], xhtmlResult, str(options))
        else:
            assert xhtmlResult in xhtml, "Expected: %s, Received: %s" % (xhtml, xhtmlResult)
 class EncodingTestCase(unittest.TestCase):
    """Serializing U+0101 to iso-8859-1, which cannot represent it.

    Character data and attribute values are expected to fall back to the
    ``&amacr;`` entity; everywhere else a UnicodeEncodeError is expected.
    """

    def throwsWithLatin1(self, input):
        # Helper: serializing *input* to iso-8859-1 must raise.
        self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})

    def testDoctypeName(self):
        self.throwsWithLatin1([["Doctype", u"\u0101"]])

    def testDoctypePublicId(self):
        self.throwsWithLatin1([["Doctype", u"potato", u"\u0101"]])

    def testDoctypeSystemId(self):
        self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])

    def testCdataCharacters(self):
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual("<style>&amacr;", serialize_html([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}],
                                                           ["Characters", u"\u0101"]],
                                                          {"encoding": "iso-8859-1"}))

    def testCharacters(self):
        self.assertEqual("&amacr;", serialize_html([["Characters", u"\u0101"]],
                                                   {"encoding": "iso-8859-1"}))

    def testStartTagName(self):
        self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])

    def testEmptyTagName(self):
        self.throwsWithLatin1([["EmptyTag", u"http://www.w3.org/1999/xhtml", u"\u0101", []]])

    def testAttributeName(self):
        self.throwsWithLatin1([["StartTag", u"http://www.w3.org/1999/xhtml", u"span", [{"namespace": None, "name": u"\u0101", "value": u"potato"}]]])

    def testAttributeValue(self):
        self.assertEqual("<span potato=&amacr;>", serialize_html([["StartTag", u"http://www.w3.org/1999/xhtml", u"span",
                                                                   [{"namespace": None, "name": u"potato", "value": u"\u0101"}]]],
                                                                 {"encoding": "iso-8859-1"}))

    def testEndTagName(self):
        self.throwsWithLatin1([["EndTag", u"http://www.w3.org/1999/xhtml", u"\u0101"]])

    def testComment(self):
        self.throwsWithLatin1([["Comment", u"\u0101"]])
 # Only define the lxml tests when lxml could be imported.
 if "lxml" in optionals_loaded:
    class LxmlTestCase(unittest.TestCase):
        """Entity handling when serializing trees parsed by lxml."""

        def setUp(self):
            # resolve_entities=False is passed so the XML parser does not
            # expand entity references itself.
            self.parser = etree.XMLParser(resolve_entities=False)
            self.treewalker = html5lib.getTreeWalker("lxml")
            self.serializer = serializer.HTMLSerializer()

        def testEntityReplacement(self):
            # By default &beta; is expected to be written as U+03B2.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser = self.parser).getroottree()
            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>""", result)

        def testEntityXML(self):
            # &gt; is expected to stay escaped in the output.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>"""
            tree = etree.fromstring(doc, parser = self.parser).getroottree()
            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False)
            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>""", result)

        def testEntityNoResolve(self):
            # With resolve_entities=False the serializer must leave &beta; alone.
            doc = """<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>"""
            tree = etree.fromstring(doc, parser = self.parser).getroottree()
            result = serializer.serialize(tree, tree="lxml", omit_optional_tags=False,
                                          resolve_entities=False)
            self.assertEqual(u"""<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>""", result)
 def test_serializer():
    """Yield one make_test case per entry in each serializer ``*.test`` file.

    Each yielded tuple is ``(make_test, input, expected, xhtml, options)``.
    ``xhtml`` defaults to the HTML expectation and is set to None for the
    'optionaltags' file, which makes make_test skip the XHTML check.
    """
    for filename in html5lib_test_files('serializer', '*.test'):
        # Close the file explicitly rather than leaking the handle; try/finally
        # is used instead of ``with`` to stay valid on old interpreters.
        testFile = open(filename)
        try:
            tests = json.load(testFile)
        finally:
            testFile.close()
        test_name = os.path.basename(filename).replace('.test','')
        for test in tests['tests']:
            xhtml = test.get("xhtml", test["expected"])
            if test_name == 'optionaltags':
                xhtml = None
            yield make_test, test["input"], test["expected"], xhtml, test.get("options", {})
--- a/lib/html5lib/tests/test_stream.py
+++ b/lib/html5lib/tests/test_stream.py
@ -1,97 +0,0 @@
 import support
 import unittest, codecs
 from html5lib.inputstream import HTMLInputStream
 class HTMLInputStreamShortChunk(HTMLInputStream):
    # Input stream with the default chunk size overridden to 2; used by the
    # newline and position tests (presumably so that reads cross chunk
    # boundaries -- confirm against HTMLInputStream).
    _defaultChunkSize = 2
 class HTMLInputStreamTest(unittest.TestCase):
    """Encoding detection, newline handling and position tracking tests.

    Uses assertEqual/assertTrue in place of the deprecated
    assertEquals/assert_ aliases.
    """

    def test_char_ascii(self):
        stream = HTMLInputStream("'", encoding='ascii')
        self.assertEqual(stream.charEncoding[0], 'ascii')
        self.assertEqual(stream.char(), "'")

    def test_char_null(self):
        # A NUL character is expected to be read back as U+FFFD.
        stream = HTMLInputStream("\x00")
        self.assertEqual(stream.char(), u'\ufffd')

    def test_char_utf8(self):
        stream = HTMLInputStream(u'\u2018'.encode('utf-8'), encoding='utf-8')
        self.assertEqual(stream.charEncoding[0], 'utf-8')
        self.assertEqual(stream.char(), u'\u2018')

    def test_char_win1252(self):
        # No encoding is passed, so windows-1252 must be picked by the stream.
        stream = HTMLInputStream(u"\xa9\xf1\u2019".encode('windows-1252'))
        self.assertEqual(stream.charEncoding[0], 'windows-1252')
        self.assertEqual(stream.char(), u"\xa9")
        self.assertEqual(stream.char(), u"\xf1")
        self.assertEqual(stream.char(), u"\u2019")

    def test_bom(self):
        # The BOM selects utf-8 and is not returned as a character.
        stream = HTMLInputStream(codecs.BOM_UTF8 + "'")
        self.assertEqual(stream.charEncoding[0], 'utf-8')
        self.assertEqual(stream.char(), "'")

    def test_utf_16(self):
        stream = HTMLInputStream((' '*1025).encode('utf-16'))
        self.assertTrue(stream.charEncoding[0] in ['utf-16-le', 'utf-16-be'], stream.charEncoding)
        self.assertEqual(len(stream.charsUntil(' ', True)), 1025)

    def test_newlines(self):
        # "\r\n" and a lone "\r" are both expected to be read as "\n".
        stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.charsUntil('c'), u"a\nbb\n")
        self.assertEqual(stream.position(), (3, 0))
        self.assertEqual(stream.charsUntil('x'), u"ccc\ndddd")
        self.assertEqual(stream.position(), (4, 4))
        self.assertEqual(stream.charsUntil('e'), u"x")
        self.assertEqual(stream.position(), (4, 5))

    def test_newlines2(self):
        # One full default-sized chunk of "\r" followed by "\n": the trailing
        # "\r\n" pair must collapse into a single newline.
        size = HTMLInputStream._defaultChunkSize
        stream = HTMLInputStream("\r" * size + "\n")
        self.assertEqual(stream.charsUntil('x'), "\n" * size)

    def test_position(self):
        stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\nccc\nddde\nf\ngh")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.charsUntil('c'), u"a\nbb\n")
        self.assertEqual(stream.position(), (3, 0))
        # Pushing the newline back moves the position to the end of line 2.
        stream.unget(u"\n")
        self.assertEqual(stream.position(), (2, 2))
        self.assertEqual(stream.charsUntil('c'), u"\n")
        self.assertEqual(stream.position(), (3, 0))
        stream.unget(u"\n")
        self.assertEqual(stream.position(), (2, 2))
        self.assertEqual(stream.char(), u"\n")
        self.assertEqual(stream.position(), (3, 0))
        self.assertEqual(stream.charsUntil('e'), u"ccc\nddd")
        self.assertEqual(stream.position(), (4, 3))
        self.assertEqual(stream.charsUntil('h'), u"e\nf\ng")
        self.assertEqual(stream.position(), (6, 1))

    def test_position2(self):
        # Read one character at a time and check the position after each.
        stream = HTMLInputStreamShortChunk("abc\nd")
        self.assertEqual(stream.position(), (1, 0))
        self.assertEqual(stream.char(), u"a")
        self.assertEqual(stream.position(), (1, 1))
        self.assertEqual(stream.char(), u"b")
        self.assertEqual(stream.position(), (1, 2))
        self.assertEqual(stream.char(), u"c")
        self.assertEqual(stream.position(), (1, 3))
        self.assertEqual(stream.char(), u"\n")
        self.assertEqual(stream.position(), (2, 0))
        self.assertEqual(stream.char(), u"d")
        self.assertEqual(stream.position(), (2, 1))
 def buildTestSuite():
    """Return a suite of every test the default loader finds in this module."""
    loader = unittest.defaultTestLoader
    suite = loader.loadTestsFromName(__name__)
    return suite
 def main():
    """Script entry point: load this module's tests, then run unittest."""
    # The suite returned here is discarded; unittest.main() does its own
    # discovery and runs the tests.
    buildTestSuite()
    unittest.main()
 # Only run the tests when this file is executed directly.
 if __name__ == '__main__':
    main()
--- a/lib/html5lib/tests/test_tokenizer.py
+++ b/lib/html5lib/tests/test_tokenizer.py
@ -1,193 +0,0 @@
 import sys
 import os
 import unittest
 import cStringIO
 import warnings
 import re
 try:
    import json
 except ImportError:
    import simplejson as json
 from support import html5lib_test_files
 from html5lib.tokenizer import HTMLTokenizer
 from html5lib import constants
 class TokenizerTestParser(object):
    """Drives HTMLTokenizer over an input and records every emitted token
    as a list, in the layout the tokenizer test files use for their
    expected output."""
    def __init__(self, initialState, lastStartTag=None):
        self.tokenizer = HTMLTokenizer
        self._state = initialState
        self._lastStartTag = lastStartTag
    def parse(self, stream, encoding=None, innerHTML=False):
        """Tokenize `stream` and return the list of recorded tokens."""
        tok = self.tokenizer(stream, encoding)
        self.outputTokens = []
        # Start the tokenizer in the state requested by the test.
        tok.state = getattr(tok, self._state)
        if self._lastStartTag is not None:
            tok.currentToken = {"type": "startTag",
                                "name": self._lastStartTag}
        # Reverse mapping: numeric token type -> type name.
        typeNames = dict((code, label)
                         for label, code in constants.tokenTypes.iteritems())
        for token in tok:
            # Looked up per token because processSpaceCharacters rebinds
            # itself on the instance after its first call.
            handler = getattr(self, 'process%s' % typeNames[token["type"]])
            handler(token)
        return self.outputTokens
    def processDoctype(self, token):
        record = [u"DOCTYPE", token["name"], token["publicId"],
                  token["systemId"], token["correct"]]
        self.outputTokens.append(record)
    def processStartTag(self, token):
        # Reversed before building the dict so that the first occurrence
        # of a repeated attribute name is the one that is kept.
        attributes = dict(token["data"][::-1])
        self.outputTokens.append([u"StartTag", token["name"],
                                  attributes, token["selfClosing"]])
    def processEmptyTag(self, token):
        if token["name"] not in constants.voidElements:
            self.outputTokens.append(u"ParseError")
        attributes = dict(token["data"][::-1])
        self.outputTokens.append([u"StartTag", token["name"], attributes])
    def processEndTag(self, token):
        record = [u"EndTag", token["name"], token["selfClosing"]]
        self.outputTokens.append(record)
    def processComment(self, token):
        record = [u"Comment", token["data"]]
        self.outputTokens.append(record)
    def processSpaceCharacters(self, token):
        record = [u"Character", token["data"]]
        self.outputTokens.append(record)
        # After the first call, later space tokens go straight to
        # processCharacters.
        self.processSpaceCharacters = self.processCharacters
    def processCharacters(self, token):
        record = [u"Character", token["data"]]
        self.outputTokens.append(record)
    def processEOF(self, token):
        # End of input produces no output token.
        pass
    def processParseError(self, token):
        record = [u"ParseError", token["data"]]
        self.outputTokens.append(record)
 def concatenateCharacterTokens(tokens):
    """Return a new token list in which runs of consecutive "Character"
    tokens are merged into a single token.

    Parse-error entries (the bare string or a ["ParseError", ...] list) and
    every other token type are passed through untouched and break a run.
    The caller's tokens are never modified: a Character token that starts a
    run is copied before later data is appended to it, so the same input
    (e.g. a test's expected output) can be processed more than once.
    """
    outputTokens = []
    for token in tokens:
        if "ParseError" not in token and token[0] == "Character":
            if (outputTokens and "ParseError" not in outputTokens[-1] and
                outputTokens[-1][0] == "Character"):
                outputTokens[-1][1] += token[1]
            else:
                # Copy so that the += above never writes into the input.
                outputTokens.append(list(token))
        else:
            outputTokens.append(token)
    return outputTokens
 def normalizeTokens(tokens):
    """Collapse every [u'ParseError', ...] entry of `tokens` to its first
    element, in place, and return the same list."""
    # TODO: convert tests to reflect arrays
    errorSlots = [slot for slot, token in enumerate(tokens)
                  if token[0] == u'ParseError']
    for slot in errorSlots:
        tokens[slot] = tokens[slot][0]
    return tokens
 def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
                 ignoreErrors=False):
    """Decide whether the received tokens satisfy the expected ones.

    With ignoreErrorOrder set, parse errors and ordinary tokens are compared
    as two separate sequences, so only their relative interleaving is
    ignored. With ignoreErrors set, parse errors are dropped altogether.
    Note that receivedTokens may be modified (see below).
    """
    def carriesSelfClosingFlag(token):
        kind, size = token[0], len(token)
        return ((kind == "StartTag" and size == 4) or
                (kind == "EndTag" and size == 3))

    # If no expected tag token carries a self-closing flag, drop the last
    # element of every received tag token before comparing.
    if not any(carriesSelfClosingFlag(token) for token in expectedTokens):
        for token in receivedTokens:
            if token[0] in ("StartTag", "EndTag"):
                token.pop()

    if not (ignoreErrorOrder or ignoreErrors):
        return expectedTokens == receivedTokens

    def splitOffErrors(tokenList):
        # -> [non-parse-error tokens, parse-error tokens]
        plain, errors = [], []
        for token in tokenList:
            if token != "ParseError":
                plain.append(token)
            elif not ignoreErrors:
                errors.append(token)
        return [plain, errors]

    return splitOffErrors(expectedTokens) == splitOffErrors(receivedTokens)
 def unescape_test(test):
    """Undo the extra level of escaping in a "doubleEscaped" test.

    The test's input, the second element of every non-ParseError output
    token, and the keys and values of any attribute dict found at index 2
    are decoded with the "unicode-escape" codec. The test dict is modified
    in place and also returned.
    """
    def decode(inp):
        return inp.decode("unicode-escape")
    test["input"] = decode(test["input"])
    for token in test["output"]:
        if token == "ParseError":
            continue
        token[1] = decode(token[1])
        # Index 2 is an attribute dict only for start tags; for other token
        # kinds it can be a flag or an identifier, which is left alone.
        if len(token) > 2 and isinstance(token[2], dict):
            attrs = token[2]
            # Snapshot the decoded pairs first: the dict must not be
            # changed while it is being iterated.
            decoded = [(decode(key), decode(value))
                       for key, value in attrs.items()]
            attrs.clear()
            attrs.update(decoded)
    return test
 def runTokenizerTest(test):
    """Run one tokenizer test case and assert that the tokens produced match
    the expected ones.

    `test` is a dict read from a tokenizer test file; it must provide
    'input', 'output' and 'initialState', and may provide 'lastStartTag',
    'doubleEscaped' and 'ignoreErrorOrder'. Raises AssertionError with a
    diagnostic message when the tokens do not match.
    """
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    if 'doubleEscaped' in test:
        test = unescape_test(test)
    expected = concatenateCharacterTokens(test['output'])
    if 'lastStartTag' not in test:
        test['lastStartTag'] = None
    # Swallow anything printed while tokenizing, and always put the real
    # stdout back afterwards so later tests and the runner can still print.
    outBuffer = cStringIO.StringIO()
    stdout = sys.stdout
    sys.stdout = outBuffer
    try:
        parser = TokenizerTestParser(test['initialState'],
                                     test['lastStartTag'])
        tokens = parser.parse(test['input'])
    finally:
        sys.stdout = stdout
    tokens = concatenateCharacterTokens(tokens)
    received = normalizeTokens(tokens)
    errorMsg = u"\n".join(["\n\nInitial state:",
                          test['initialState'] ,
                          "\nInput:", unicode(test['input']),
                          "\nExpected:", unicode(expected),
                          "\nreceived:", unicode(tokens)])
    errorMsg = errorMsg.encode("utf-8")
    ignoreErrorOrder = test.get('ignoreErrorOrder', False)
    assert tokensMatch(expected, received, ignoreErrorOrder), errorMsg
 def _doCapitalize(match):
    """Return the word character captured by `match`, upper-cased."""
    letter = match.group(1)
    return letter.upper()
 # Bound substitution: each run of non-word characters together with the
 # word character that follows it is replaced by the callback's result.
 _capitalizeRe = re.compile(r"\W+(\w)").sub
 def capitalize(s):
    """Lower-case `s`, then drop each run of non-word characters and
    upper-case the character that followed it ("Data state" -> "dataState")."""
    return _capitalizeRe(_doCapitalize, s.lower())
 def test_tokenizer():
    """Test generator: yield (runTokenizerTest, test) once for every initial
    state of every test in every tokenizer '*.test' file.

    Tests that list no 'initialStates' are run in "Data state" only.
    """
    import copy
    for filename in html5lib_test_files('tokenizer', '*.test'):
        # Close the file as soon as it has been parsed.
        fp = open(filename)
        try:
            tests = json.load(fp)
        finally:
            fp.close()
        if 'tests' in tests:
            for test in tests['tests']:
                if 'initialStates' not in test:
                    test["initialStates"] = ["Data state"]
                for initialState in test["initialStates"]:
                    # runTokenizerTest modifies the dict it is given (it
                    # unescapes and normalises it in place), so every
                    # yielded case gets a private copy.
                    case = copy.deepcopy(test)
                    case["initialState"] = capitalize(initialState)
                    yield runTokenizerTest, case
--- a/lib/html5lib/tests/test_treewalkers.py
+++ b/lib/html5lib/tests/test_treewalkers.py
@ -1,311 +0,0 @@
 import os
 import sys
 import StringIO
 import unittest
 import warnings
 warnings.simplefilter("error")
 from support import html5lib_test_files, TestData, convertExpected
 from html5lib import html5parser, treewalkers, treebuilders, constants
 from html5lib.filters.lint import Filter as LintFilter, LintError
 def PullDOMAdapter(node):
    """Walk a DOM subtree depth-first and yield (event, node) pairs using
    the xml.dom.pulldom event constants.

    Documents and document fragments only contribute their children.
    Raises NotImplementedError for DOCTYPE nodes and for any node type not
    handled here.
    """
    from xml.dom import Node
    from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, COMMENT, CHARACTERS
    kind = node.nodeType
    if kind == Node.DOCUMENT_TYPE_NODE:
        raise NotImplementedError("DOCTYPE nodes are not supported by PullDOM")
    if kind == Node.COMMENT_NODE:
        yield COMMENT, node
    elif kind in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
        yield CHARACTERS, node
    elif kind in (Node.ELEMENT_NODE, Node.DOCUMENT_NODE,
                  Node.DOCUMENT_FRAGMENT_NODE):
        # Only real elements are bracketed by start/end events.
        isElement = (kind == Node.ELEMENT_NODE)
        if isElement:
            yield START_ELEMENT, node
        for child in node.childNodes:
            for childEvent in PullDOMAdapter(child):
                yield childEvent
        if isElement:
            yield END_ELEMENT, node
    else:
        raise NotImplementedError("Node type not supported: " + str(node.nodeType))
 #Registry of the tree implementations under test, keyed by display name.
 #Each entry provides the "builder" handed to HTMLParser and the "walker"
 #used to read the tree back; an optional "adapter" converts the parsed
 #document before it is given to the walker.
 treeTypes = {
 "simpletree":  {"builder": treebuilders.getTreeBuilder("simpletree"),
                "walker":  treewalkers.getTreeWalker("simpletree")},
 "DOM":         {"builder": treebuilders.getTreeBuilder("dom"),
                "walker":  treewalkers.getTreeWalker("dom")},
 "PullDOM":     {"builder": treebuilders.getTreeBuilder("dom"),
                "adapter": PullDOMAdapter,
                "walker":  treewalkers.getTreeWalker("pulldom")},
 }
 #Try whatever etree implementations are available from a list that are
 #"supposed" to work
 try:
    import xml.etree.ElementTree as ElementTree
    treeTypes['ElementTree'] = \
        {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
         "walker":  treewalkers.getTreeWalker("etree", ElementTree)}
 except ImportError:
    try:
        import elementtree.ElementTree as ElementTree
        treeTypes['ElementTree'] = \
            {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
             "walker":  treewalkers.getTreeWalker("etree", ElementTree)}
    except ImportError:
        pass
 try:
    import xml.etree.cElementTree as ElementTree
    treeTypes['cElementTree'] = \
        {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
         "walker":  treewalkers.getTreeWalker("etree", ElementTree)}
 except ImportError:
    try:
        import cElementTree as ElementTree
        treeTypes['cElementTree'] = \
            {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
             "walker":  treewalkers.getTreeWalker("etree", ElementTree)}
    except ImportError:
        pass
 try:
    import lxml.etree as ElementTree
 #    treeTypes['lxml_as_etree'] = \
 #        {"builder": treebuilders.getTreeBuilder("etree", ElementTree),
 #         "walker":  treewalkers.getTreeWalker("etree", ElementTree)}
    treeTypes['lxml_native'] = \
        {"builder": treebuilders.getTreeBuilder("lxml"),
         "walker":  treewalkers.getTreeWalker("lxml")}
 except ImportError:
    pass
 try:
    import BeautifulSoup
    treeTypes["beautifulsoup"] = \
        {"builder": treebuilders.getTreeBuilder("beautifulsoup"),
         "walker":  treewalkers.getTreeWalker("beautifulsoup")}
 except ImportError:
    pass
 #pxdom is an optional alternative DOM implementation; register it with the
 #"dom" tree builder when it can be imported
 try:
    import pxdom
    treeTypes['pxdom'] = \
        {"builder": treebuilders.getTreeBuilder("dom", pxdom),
         "walker":  treewalkers.getTreeWalker("dom")}
 except ImportError:
    pass
 #Optional genshi support: GenshiAdapter is only defined when genshi can be
 #imported, and it is currently unused because the registration at the end
 #of this block is commented out.
 try:
    from genshi.core import QName, Attrs
    from genshi.core import START, END, TEXT, COMMENT, DOCTYPE
    def GenshiAdapter(tree):
        """Walk `tree` with the "simpletree" tree walker and yield genshi
        stream events as (kind, data, (None, -1, -1)) tuples.

        Consecutive Characters/SpaceCharacters tokens are merged into a
        single TEXT event.
        """
        # Text collected so far that has not been emitted yet.
        text = None
        for token in treewalkers.getTreeWalker("simpletree")(tree):
            type = token["type"]
            if type in ("Characters", "SpaceCharacters"):
                if text is None:
                    text = token["data"]
                else:
                    text += token["data"]
            elif text is not None:
                # Any non-text token flushes the pending text first.
                yield TEXT, text, (None, -1, -1)
                text = None
            if type in ("StartTag", "EmptyTag"):
                if token["namespace"]:
                    name = u"{%s}%s" % (token["namespace"], token["name"])
                else:
                    name = token["name"]
                # NOTE(review): convertTokens() below reads token["data"] as
                # a dict keyed by (namespace, name); if that also holds
                # here, this loop unpacks the keys rather than name/value
                # pairs -- confirm before enabling the adapter.
                yield (START,
                       (QName(name),
                        Attrs([(QName(attr),value) for attr,value in token["data"]])),
                       (None, -1, -1))
                if type == "EmptyTag":
                    # Fall through to the EndTag branch below so an empty
                    # tag also emits its END event.
                    type = "EndTag"
            if type == "EndTag":
                # NOTE(review): unlike START, the namespace is not folded
                # into this name -- confirm that is intended.
                yield END, QName(token["name"]), (None, -1, -1)
            elif type == "Comment":
                yield COMMENT, token["data"], (None, -1, -1)
            elif type == "Doctype":
                yield DOCTYPE, (token["name"], token["publicId"], 
                                token["systemId"]), (None, -1, -1)
            else:
                pass # FIXME: What to do?
        # Flush text left over at the end of the token stream.
        if text is not None:
            yield TEXT, text, (None, -1, -1)
    #treeTypes["genshi"] = \
    #    {"builder": treebuilders.getTreeBuilder("simpletree"),
    #     "adapter": GenshiAdapter,
    #     "walker":  treewalkers.getTreeWalker("genshi")}
 except ImportError:
    pass
 def concatenateCharacterTokens(tokens):
    """Yield the tokens of `tokens` with each run of consecutive
    "Characters"/"SpaceCharacters" tokens replaced by one new "Characters"
    token holding the joined data. Other tokens are yielded unchanged."""
    pending = None
    for token in tokens:
        if token["type"] in ("Characters", "SpaceCharacters"):
            if pending is None:
                pending = {"type": "Characters", "data": token["data"]}
            else:
                pending["data"] += token["data"]
            continue
        # A non-text token ends the current run.
        if pending is not None:
            yield pending
            pending = None
        yield token
    if pending is not None:
        yield pending
 def convertTokens(tokens):
    """Render a tree-walker token stream as an indented text dump, one line
    per tag, attribute, comment, doctype or text run, and return it as a
    single newline-joined string."""
    def qualify(namespace, localName):
        # "<prefix> <name>", falling back to the raw namespace when no
        # prefix is registered for it.
        if namespace in constants.prefixes:
            label = constants.prefixes[namespace]
        else:
            label = namespace
        return label + u" " + localName

    output = []
    indent = 0
    for token in concatenateCharacterTokens(tokens):
        kind = token["type"]
        pad = " "*indent
        if kind in ("StartTag", "EmptyTag"):
            tagNamespace = token["namespace"]
            if tagNamespace and tagNamespace != constants.namespaces["html"]:
                tagName = qualify(tagNamespace, token["name"])
            else:
                tagName = token["name"]
            output.append(u"%s<%s>" % (pad, tagName))
            indent += 2
            attrs = token["data"]
            if attrs:
                #TODO: Remove this if statement, attrs should always exist
                attrPad = " "*indent
                for (attrNamespace, attrName), value in sorted(attrs.items()):
                    if attrNamespace:
                        outputname = qualify(attrNamespace, attrName)
                    else:
                        outputname = attrName
                    output.append(u"%s%s=\"%s\"" % (attrPad, outputname, value))
            if kind == "EmptyTag":
                indent -= 2
        elif kind == "EndTag":
            indent -= 2
        elif kind == "Comment":
            output.append("%s<!-- %s -->" % (pad, token["data"]))
        elif kind == "Doctype":
            if not token["name"]:
                output.append("%s<!DOCTYPE >" % (pad,))
            elif token["publicId"]:
                output.append("""%s<!DOCTYPE %s "%s" "%s">"""%
                              (pad, token["name"],
                               token["publicId"],
                               token["systemId"] or ""))
            elif token["systemId"]:
                output.append("""%s<!DOCTYPE %s "" "%s">"""%
                              (pad, token["name"],
                               token["systemId"]))
            else:
                output.append("%s<!DOCTYPE %s>"%(pad, token["name"]))
        elif kind in ("Characters", "SpaceCharacters"):
            output.append("%s\"%s\"" % (pad, token["data"]))
        else:
            pass # TODO: what to do with errors?
    return u"\n".join(output)
 import re
 attrlist = re.compile(r"^(\s+)\w+=.*(\n\1\w+=.*)+",re.M)
 def sortattrs(x):
    """Return the full text of match `x` with its lines in sorted order."""
    return "\n".join(sorted(x.group(0).split("\n")))
 class TokenTestCase(unittest.TestCase):
    def test_all_tokens(self):
        # Parse a small fixed document with every registered tree type and
        # compare the walker's tokens pairwise against the expected list.
        xhtml = u'http://www.w3.org/1999/xhtml'
        def tag(kind, name):
            return {'data': {}, 'type': kind, 'namespace': xhtml, 'name': name}
        def text(data):
            return {'data': data, 'type': 'Characters'}
        expected = [
            tag('StartTag', u'html'),
            tag('StartTag', u'head'),
            tag('EndTag', u'head'),
            tag('StartTag', u'body'),
            text(u'a'),
            tag('StartTag', u'div'),
            text(u'b'),
            tag('EndTag', u'div'),
            text(u'c'),
            tag('EndTag', u'body'),
            tag('EndTag', u'html'),
            ]
        for treeName, treeCls in treeTypes.iteritems():
            parser = html5parser.HTMLParser(tree = treeCls["builder"])
            document = parser.parse("<html><head></head><body>a<div>b</div>c</body></html>")
            # Tree types without an adapter use the document as parsed.
            adapt = treeCls.get("adapter", lambda x: x)
            walked = treeCls["walker"](adapt(document))
            for expectedToken, outputToken in zip(expected, walked):
                self.assertEquals(expectedToken, outputToken)
 def run_test(innerHTML, input, expected, errors, treeClass):
    """Parse `input` with the given tree type, walk the result, and assert
    that the serialised token dump equals the expected tree dump.

    When `innerHTML` is set the input is parsed as a fragment in that
    context. Inputs that raise DataLossWarning while parsing, and walkers
    that raise NotImplementedError, are silently skipped.
    """
    try:
        parser = html5parser.HTMLParser(tree = treeClass["builder"])
        source = StringIO.StringIO(input)
        if innerHTML:
            document = parser.parseFragment(source, innerHTML)
        else:
            document = parser.parse(source)
    except constants.DataLossWarning:
        #Ignore testcases we know we don't pass
        return
    adapt = treeClass.get("adapter", lambda x: x)
    document = adapt(document)
    try:
        dump = convertTokens(treeClass["walker"](document))
        # Attribute lines are sorted on both sides so attribute order does
        # not affect the comparison.
        output = attrlist.sub(sortattrs, dump)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        report = "\n".join([
                "", "Input:", input,
                "", "Expected:", expected,
                "", "Received:", output
                ])
        assert expected == output, report
    except NotImplementedError:
        pass # Amnesty for those that confess...
 def test_treewalker():
    """Test generator: yield one run_test call per tree type and per test
    case found in the 'tree-construction' test files."""
    sys.stdout.write('Testing tree walkers '+ " ".join(treeTypes.keys()) + "\n")
    for treeCls in treeTypes.itervalues():
        for filename in html5lib_test_files('tree-construction'):
            for test in TestData(filename, "data"):
                data = test["data"]
                errors = test["errors"].split("\n")
                innerHTML = test["document-fragment"]
                expected = test["document"]
                yield run_test, innerHTML, data, expected, errors, treeCls
--- a/lib/html5lib/tests/test_whitespace_filter.py
+++ b/lib/html5lib/tests/test_whitespace_filter.py
@ -1,123 +0,0 @@
 import unittest
 from html5lib.filters.whitespace import Filter
 from html5lib.constants import spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)
 class TestCase(unittest.TestCase):
    # Token builders shared by the tests below.
    @staticmethod
    def _start(name, attrs=None):
        if attrs is None:
            attrs = []
        return {"type": u"StartTag", "name": name, "data": attrs}
    @staticmethod
    def _end(name):
        return {"type": u"EndTag", "name": name, "data": []}
    @staticmethod
    def _chars(data):
        return {"type": u"Characters", "data": data}
    @staticmethod
    def _space(data):
        return {"type": u"SpaceCharacters", "data": data}
    def _element(self, name, *content):
        # Token list for <name>...content...</name>.
        return [self._start(name)] + list(content) + [self._end(name)]

    def runTest(self, input, expected):
        # Run the whitespace filter over `input` and compare with `expected`.
        output = list(Filter(input))
        report = ["\n\nInput:", str(input),
                  "\nExpected:", str(expected),
                  "\nReceived:", str(output)]
        errorMsg = "\n".join(report)
        self.assertEquals(output, expected, errorMsg)
    def runTestUnmodifiedOutput(self, input):
        # The filter is expected to leave `input` exactly as it is.
        self.runTest(input, input)

    def testPhrasingElements(self):
        self.runTestUnmodifiedOutput([
            self._chars(u"This is a "),
            self._start(u"span"),
            self._chars(u"phrase"),
            self._end(u"span"),
            self._space(u" "),
            self._chars(u"with"),
            self._space(u" "),
            self._start(u"em"),
            self._chars(u"emphasised text"),
            self._end(u"em"),
            self._chars(u" and an "),
            self._start(u"img", [[u"alt", u"image"]]),
            self._chars(u"."),
            ])

    # Inside <p>, runs of whitespace are expected to collapse to one space.
    def testLeadingWhitespace(self):
        self.runTest(
            self._element(u"p", self._space(spaceCharacters),
                          self._chars(u"foo")),
            self._element(u"p", self._space(u" "), self._chars(u"foo")))
    def testLeadingWhitespaceAsCharacters(self):
        self.runTest(
            self._element(u"p", self._chars(spaceCharacters + u"foo")),
            self._element(u"p", self._chars(u" foo")))
    def testTrailingWhitespace(self):
        self.runTest(
            self._element(u"p", self._chars(u"foo"),
                          self._space(spaceCharacters)),
            self._element(u"p", self._chars(u"foo"), self._space(u" ")))
    def testTrailingWhitespaceAsCharacters(self):
        self.runTest(
            self._element(u"p", self._chars(u"foo" + spaceCharacters)),
            self._element(u"p", self._chars(u"foo ")))
    def testWhitespace(self):
        self.runTest(
            self._element(u"p",
                          self._chars(u"foo" + spaceCharacters + "bar")),
            self._element(u"p", self._chars(u"foo bar")))

    # Inside <pre>, the same inputs are expected to pass through untouched.
    def testLeadingWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            self._element(u"pre", self._space(spaceCharacters),
                          self._chars(u"foo")))
    def testLeadingWhitespaceAsCharactersInPre(self):
        self.runTestUnmodifiedOutput(
            self._element(u"pre", self._chars(spaceCharacters + u"foo")))
    def testTrailingWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            self._element(u"pre", self._chars(u"foo"),
                          self._space(spaceCharacters)))
    def testTrailingWhitespaceAsCharactersInPre(self):
        self.runTestUnmodifiedOutput(
            self._element(u"pre", self._chars(u"foo" + spaceCharacters)))
    def testWhitespaceInPre(self):
        self.runTestUnmodifiedOutput(
            self._element(u"pre",
                          self._chars(u"foo" + spaceCharacters + "bar")))
 def buildTestSuite():
    """Return a suite holding every test the default loader finds in this
    module."""
    loader = unittest.defaultTestLoader
    suite = loader.loadTestsFromName(__name__)
    return suite
 def main():
    """Script entry point: build the module's suite, then run unittest.main()."""
    # The suite returned here is not used; unittest.main() does its own
    # test discovery.
    buildTestSuite()
    unittest.main()
 if __name__ == "__main__":
    main()
--- a/lib/html5lib/tests/testdata/encoding/test-yahoo-jp.dat
+++ b/lib/html5lib/tests/testdata/encoding/test-yahoo-jp.dat
@ -1,10 +0,0 @@
 #data
 <html>
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
 <!--京-->
 <title>Yahoo! JAPAN</title>
 <meta name="description" content="日本最大級のポータルサイト。検索、オークション、ニュース、メール、コミュニティ、ショッピング、など80以上のサービスを展開。あなたの生活をより豊かにする「ライフ・エンジン」を目指していきます。">
 <style type="text/css" media="all">
 #encoding
 euc_jp
--- a/lib/html5lib/tests/testdata/encoding/tests1.dat
+++ b/lib/html5lib/tests/testdata/encoding/tests1.dat
--- a/lib/html5lib/tests/testdata/encoding/tests2.dat
+++ b/lib/html5lib/tests/testdata/encoding/tests2.dat
@ -1,115 +0,0 @@
 #data
 <meta
 #encoding
 windows-1252
 #data
 <
 #encoding
 windows-1252
 #data
 <!
 #encoding
 windows-1252
 #data
 <meta charset = "
 #encoding
 windows-1252
 #data
 <meta charset=euc_jp
 #encoding
 windows-1252
 #data
 <meta <meta charset='euc_jp'>
 #encoding
 euc_jp
 #data
 <meta       charset    =     'euc_jp'>
 #encoding
 euc_jp
 #data
 <!-- -->
 <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
 #encoding
 utf-8
 #data
 <!-- -->
 <meta http-equiv="Content-Type" content="text/html; charset=utf
 #encoding
 windows-1252
 #data
 <meta http-equiv="Content-Type<meta charset="utf-8">
 #encoding
 windows-1252
 #data
 <meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
 #encoding
 utf-8
 #data
 <meta http-equiv="Content-Type" content="text/html; charset='utf-8">
 #encoding
 windows-1252
 #data
 <meta                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
 #encoding
 windows-1252
 #data
 <meta charset                    =                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
 #encoding
 windows-1252
 #data
 <meta charset=                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            utf-8
 >
 #encoding
 utf-8
 #data
 <meta content = "text/html;
 #encoding
 windows-1252
 #data
 <meta charset="UTF-16">
 #encoding
 utf-8
 #data
 <meta charset="UTF-16LE">
 #encoding
 utf-8
 #data
 <meta charset="UTF-16BE">
 #encoding
 utf-8
 #data
 <html a=ñ>
 <meta charset="utf-8">
 #encoding
 utf-8
 #data
 <html ñ>
 <meta charset="utf-8">
 #encoding
 utf-8
 #data
 <html>ñ
 <meta charset="utf-8">
 #encoding
 utf-8
--- a/lib/html5lib/tests/testdata/sanitizer/tests1.dat
+++ b/lib/html5lib/tests/testdata/sanitizer/tests1.dat
@ -1,501 +0,0 @@
 [
  {
    "name": "IE_Comments",
    "input": "<!--[if gte IE 4]><script>alert('XSS');</script><![endif]-->",
    "output": ""
  },
  {
    "name": "IE_Comments_2",
    "input": "<![if !IE 5]><script>alert('XSS');</script><![endif]>",
    "output": "&lt;script&gt;alert('XSS');&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "allow_colons_in_path_component",
    "input": "<a href=\"./this:that\">foo</a>",
    "output": "<a href='./this:that'>foo</a>"
  },
  {
    "name": "background_attribute",
    "input": "<div background=\"javascript:alert('XSS')\"></div>",
    "output": "<div/>",
    "xhtml": "<div></div>",
    "rexml": "<div></div>"
  },
  {
    "name": "bgsound",
    "input": "<bgsound src=\"javascript:alert('XSS');\" />",
    "output": "&lt;bgsound src=\"javascript:alert('XSS');\"/&gt;",
    "rexml": "&lt;bgsound src=\"javascript:alert('XSS');\"&gt;&lt;/bgsound&gt;"
  },
  {
    "name": "div_background_image_unicode_encoded",
    "input": "<div style=\"background-image:\u00a5\u00a2\u006C\u0028'\u006a\u0061\u00a6\u0061\u00a3\u0063\u00a2\u0069\u00a0\u00a4\u003a\u0061\u006c\u0065\u00a2\u00a4\u0028.1027\u0058.1053\u0053\u0027\u0029'\u0029\">foo</div>",
    "output": "<div style=''>foo</div>"
  },
  {
    "name": "div_expression",
    "input": "<div style=\"width: expression(alert('XSS'));\">foo</div>",
    "output": "<div style=''>foo</div>"
  },
  {
    "name": "double_open_angle_brackets",
    "input": "<img src=http://ha.ckers.org/scriptlet.html <",
    "output": "<img src='http://ha.ckers.org/scriptlet.html'>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "double_open_angle_brackets_2",
    "input": "<script src=http://ha.ckers.org/scriptlet.html <",
    "output": "&lt;script src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "grave_accents",
    "input": "<img src=`javascript:alert('XSS')` />",
    "output": "<img/>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "img_dynsrc_lowsrc",
    "input": "<img dynsrc=\"javascript:alert('XSS')\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "img_vbscript",
    "input": "<img src='vbscript:msgbox(\"XSS\")' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "input_image",
    "input": "<input type=\"image\" src=\"javascript:alert('XSS');\" />",
    "output": "<input type='image'/>",
    "rexml": "<input type='image' />"
  },
  {
    "name": "link_stylesheets",
    "input": "<link rel=\"stylesheet\" href=\"javascript:alert('XSS');\" />",
    "output": "&lt;link rel=\"stylesheet\" href=\"javascript:alert('XSS');\"/&gt;",
    "rexml": "&lt;link href=\"javascript:alert('XSS');\" rel=\"stylesheet\"/&gt;"
  },
  {
    "name": "link_stylesheets_2",
    "input": "<link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\" />",
    "output": "&lt;link rel=\"stylesheet\" href=\"http://ha.ckers.org/xss.css\"/&gt;",
    "rexml": "&lt;link href=\"http://ha.ckers.org/xss.css\" rel=\"stylesheet\"/&gt;"
  },
  {
    "name": "list_style_image",
    "input": "<li style=\"list-style-image: url(javascript:alert('XSS'))\">foo</li>",
    "output": "<li style=''>foo</li>"
  },
  {
    "name": "no_closing_script_tags",
    "input": "<script src=http://ha.ckers.org/xss.js?<b>",
    "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "non_alpha_non_digit",
    "input": "<script/XSS src=\"http://ha.ckers.org/xss.js\"></script>",
    "output": "&lt;script XSS=\"\" src=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "non_alpha_non_digit_2",
    "input": "<a onclick!\\#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>foo</a>",
    "output": "<a>foo</a>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "non_alpha_non_digit_3",
    "input": "<img/src=\"http://ha.ckers.org/xss.js\"/>",
    "output": "<img src='http://ha.ckers.org/xss.js'/>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "non_alpha_non_digit_II",
    "input": "<a href!\\#$%&()*~+-_.,:;?@[/|]^`=alert('XSS')>foo</a>",
    "output": "<a>foo</a>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "non_alpha_non_digit_III",
    "input": "<a/href=\"javascript:alert('XSS');\">foo</a>",
    "output": "<a>foo</a>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "platypus",
    "input": "<a href=\"http://www.ragingplatypus.com/\" style=\"display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;\">never trust your upstream platypus</a>",
    "output": "<a href='http://www.ragingplatypus.com/' style='display: block; width: 100%; height: 100%; background-color: black; background-x: center; background-y: center;'>never trust your upstream platypus</a>"
  },
  {
    "name": "protocol_resolution_in_script_tag",
    "input": "<script src=//ha.ckers.org/.j></script>",
    "output": "&lt;script src=\"//ha.ckers.org/.j\"&gt;&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_allow_anchors",
    "input": "<a href='foo' onclick='bar'><script>baz</script></a>",
    "output": "<a href='foo'>&lt;script&gt;baz&lt;/script&gt;</a>"
  },
  {
    "name": "should_allow_image_alt_attribute",
    "input": "<img alt='foo' onclick='bar' />",
    "output": "<img alt='foo'/>",
    "rexml": "<img alt='foo' />"
  },
  {
    "name": "should_allow_image_height_attribute",
    "input": "<img height='foo' onclick='bar' />",
    "output": "<img height='foo'/>",
    "rexml": "<img height='foo' />"
  },
  {
    "name": "should_allow_image_src_attribute",
    "input": "<img src='foo' onclick='bar' />",
    "output": "<img src='foo'/>",
    "rexml": "<img src='foo' />"
  },
  {
    "name": "should_allow_image_width_attribute",
    "input": "<img width='foo' onclick='bar' />",
    "output": "<img width='foo'/>",
    "rexml": "<img width='foo' />"
  },
  {
    "name": "should_handle_blank_text",
    "input": "",
    "output": ""
  },
  {
    "name": "should_handle_malformed_image_tags",
    "input": "<img \"\"\"><script>alert(\"XSS\")</script>\">",
    "output": "<img/>&lt;script&gt;alert(\"XSS\")&lt;/script&gt;\"&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_handle_non_html",
    "input": "abc",
    "output": "abc"
  },
  {
    "name": "should_not_fall_for_ridiculous_hack",
    "input": "<img\nsrc\n=\n\"\nj\na\nv\na\ns\nc\nr\ni\np\nt\n:\na\nl\ne\nr\nt\n(\n'\nX\nS\nS\n'\n)\n\"\n />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_0",
    "input": "<img src=\"javascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_1",
    "input": "<img src=javascript:alert('XSS') />",
    "output": "<img/>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_10",
    "input": "<img src=\"jav&#x0A;ascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_11",
    "input": "<img src=\"jav&#x0D;ascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_12",
    "input": "<img src=\" &#14;  javascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_13",
    "input": "<img src=\"&#x20;javascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_14",
    "input": "<img src=\"&#xA0;javascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_2",
    "input": "<img src=\"JaVaScRiPt:alert('XSS')\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_3",
    "input": "<img src='javascript:alert(&quot;XSS&quot;)' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_4",
    "input": "<img src='javascript:alert(String.fromCharCode(88,83,83))' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_5",
    "input": "<img src='&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41;' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_6",
    "input": "<img src='&#0000106;&#0000097;&#0000118;&#0000097;&#0000115;&#0000099;&#0000114;&#0000105;&#0000112;&#0000116;&#0000058;&#0000097;&#0000108;&#0000101;&#0000114;&#0000116;&#0000040;&#0000039;&#0000088;&#0000083;&#0000083;&#0000039;&#0000041' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_7",
    "input": "<img src='&#x6A;&#x61;&#x76;&#x61;&#x73;&#x63;&#x72;&#x69;&#x70;&#x74;&#x3A;&#x61;&#x6C;&#x65;&#x72;&#x74;&#x28;&#x27;&#x58;&#x53;&#x53;&#x27;&#x29' />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_8",
    "input": "<img src=\"jav\tascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_not_fall_for_xss_image_hack_9",
    "input": "<img src=\"jav&#x09;ascript:alert('XSS');\" />",
    "output": "<img/>",
    "rexml": "<img />"
  },
  {
    "name": "should_sanitize_half_open_scripts",
    "input": "<img src=\"javascript:alert('XSS')\"",
    "output": "<img/>",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_sanitize_invalid_script_tag",
    "input": "<script/XSS SRC=\"http://ha.ckers.org/xss.js\"></script>",
    "output": "&lt;script XSS=\"\" SRC=\"http://ha.ckers.org/xss.js\"&gt;&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_sanitize_script_tag_with_multiple_open_brackets",
    "input": "<<script>alert(\"XSS\");//<</script>",
    "output": "&lt;&lt;script&gt;alert(\"XSS\");//&lt;&lt;/script&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_sanitize_script_tag_with_multiple_open_brackets_2",
    "input": "<iframe src=http://ha.ckers.org/scriptlet.html\n<",
    "output": "&lt;iframe src=\"http://ha.ckers.org/scriptlet.html\" &lt;=\"\"&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_sanitize_tag_broken_up_by_null",
    "input": "<scr\u0000ipt>alert(\"XSS\")</scr\u0000ipt>",
    "output": "&lt;scr\ufffdipt&gt;alert(\"XSS\")&lt;/scr\ufffdipt&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_sanitize_unclosed_script",
    "input": "<script src=http://ha.ckers.org/xss.js?<b>",
    "output": "&lt;script src=\"http://ha.ckers.org/xss.js?&amp;lt;b\"&gt;",
    "rexml": "Ill-formed XHTML!"
  },
  {
    "name": "should_strip_href_attribute_in_a_with_bad_protocols",
    "input": "<a href=\"javascript:XSS\" title=\"1\">boo</a>",
    "output": "<a title='1'>boo</a>"
  },
  {
    "name": "should_strip_href_attribute_in_a_with_bad_protocols_and_whitespace",
    "input": "<a href=\" javascript:XSS\" title=\"1\">boo</a>",
    "output": "<a title='1'>boo</a>"
  },
  {
    "name": "should_strip_src_attribute_in_img_with_bad_protocols",
    "input": "<img src=\"javascript:XSS\" title=\"1\">boo</img>",
    "output": "<img title='1'/>boo",
    "rexml": "<img title='1' />"
  },
  {
    "name": "should_strip_src_attribute_in_img_with_bad_protocols_and_whitespace",
    "input": "<img src=\" javascript:XSS\" title=\"1\">boo</img>",
    "output": "<img title='1'/>boo",
    "rexml": "<img title='1' />"
  },
  {
    "name": "xml_base",
    "input": "<div xml:base=\"javascript:alert('XSS');//\">foo</div>",
    "output": "<div>foo</div>"
  },
  {
    "name": "xul",
    "input": "<p style=\"-moz-binding:url('http://ha.ckers.org/xssmoz.xml#xss')\">fubar</p>",
    "output": "<p style=''>fubar</p>"
  },
  {
    "name": "quotes_in_attributes",
    "input": "<img src='foo' title='\"foo\" bar' />",
    "rexml": "<img src='foo' title='\"foo\" bar' />",
    "output": "<img title='&quot;foo&quot; bar' src='foo'/>"
  },
  {
    "name": "uri_refs_in_svg_attributes",
    "input": "<rect fill='url(#foo)' />",
    "rexml": "<rect fill='url(#foo)'></rect>",
    "xhtml": "<rect fill='url(#foo)'></rect>",
    "output": "<rect fill='url(#foo)'/>"
  },
  {
    "name": "absolute_uri_refs_in_svg_attributes",
    "input": "<rect fill='url(http://bad.com/) #fff' />",
    "rexml": "<rect fill='  #fff'></rect>",
    "xhtml": "<rect fill='  #fff'></rect>",
    "output": "<rect fill='  #fff'/>"
  },
  {
    "name": "uri_ref_with_space_in svg_attribute",
    "input": "<rect fill='url(\n#foo)' />",
    "rexml": "<rect fill='url(\n#foo)'></rect>",
    "xhtml": "<rect fill='url(\n#foo)'></rect>",
    "output": "<rect fill='url(\n#foo)'/>"
  },
  {
    "name": "absolute_uri_ref_with_space_in svg_attribute",
    "input": "<rect fill=\"url(\nhttp://bad.com/)\" />",
    "rexml": "<rect fill=' '></rect>",
    "xhtml": "<rect fill=' '></rect>",
    "output": "<rect fill=' '/>"
  },
  {
    "name": "allow_html5_image_tag",
    "input": "<image src='foo' />",
    "rexml": "&lt;image src=\"foo\"&gt;&lt;/image&gt;",
    "output": "&lt;image src=\"foo\"/&gt;"
  },
  {
    "name": "style_attr_end_with_nothing",
    "input": "<div style=\"color: blue\" />",
    "output": "<div style='color: blue;'/>",
    "xhtml": "<div style='color: blue;'></div>",
    "rexml": "<div style='color: blue;'></div>"
  },
  {
    "name": "style_attr_end_with_space",
    "input": "<div style=\"color: blue \" />",
    "output": "<div style='color: blue ;'/>",
    "xhtml": "<div style='color: blue ;'></div>",
    "rexml": "<div style='color: blue ;'></div>"
  },
  {
    "name": "style_attr_end_with_semicolon",
    "input": "<div style=\"color: blue;\" />",
    "output": "<div style='color: blue;'/>",
    "xhtml": "<div style='color: blue;'></div>",
    "rexml": "<div style='color: blue;'></div>"
  },
  {
    "name": "style_attr_end_with_semicolon_space",
    "input": "<div style=\"color: blue; \" />",
    "output": "<div style='color: blue;'/>",
    "xhtml": "<div style='color: blue;'></div>",
    "rexml": "<div style='color: blue;'></div>"
  },
  {
   "name": "attributes_with_embedded_quotes",
   "input": "<img src=doesntexist.jpg\"'onerror=\"alert(1) />",
   "output": "<img src='doesntexist.jpg&quot;&apos;onerror=&quot;alert(1)'/>",
   "rexml": "Ill-formed XHTML!"
  },
  {
   "name": "attributes_with_embedded_quotes_II",
   "input": "<img src=notthere.jpg\"\"onerror=\"alert(2) />",
   "output": "<img src='notthere.jpg&quot;&quot;onerror=&quot;alert(2)'/>",
   "rexml": "Ill-formed XHTML!"
  }
 ]
--- a/lib/html5lib/tests/testdata/serializer/core.test
+++ b/lib/html5lib/tests/testdata/serializer/core.test
@ -1,125 +0,0 @@
 {"tests": [
 {"description": "proper attribute value escaping",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test \"with\" &quot;"}]]],
 "expected": ["<span title='test \"with\" &amp;quot;'>"]
 },
 {"description": "proper attribute value non-quoting",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo"}]]],
 "expected": ["<span title=foo>"],
 "xhtml":    ["<span title=\"foo\">"]
 },
 {"description": "proper attribute value non-quoting (with <)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo<bar"}]]],
 "expected": ["<span title=foo<bar>"],
 "xhtml":    ["<span title=\"foo&lt;bar\">"]
 },
 {"description": "proper attribute value quoting (with =)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo=bar"}]]],
 "expected": ["<span title=\"foo=bar\">"]
 },
 {"description": "proper attribute value quoting (with >)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo>bar"}]]],
 "expected": ["<span title=\"foo>bar\">"]
 },
 {"description": "proper attribute value quoting (with \")",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\"bar"}]]],
 "expected": ["<span title='foo\"bar'>"]
 },
 {"description": "proper attribute value quoting (with ')",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar"}]]],
 "expected": ["<span title=\"foo'bar\">"]
 },
 {"description": "proper attribute value quoting (with both \" and ')",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo'bar\"baz"}]]],
 "expected": ["<span title=\"foo'bar&quot;baz\">"]
 },
 {"description": "proper attribute value quoting (with space)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo bar"}]]],
 "expected": ["<span title=\"foo bar\">"]
 },
 {"description": "proper attribute value quoting (with tab)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\tbar"}]]],
 "expected": ["<span title=\"foo\tbar\">"]
 },
 {"description": "proper attribute value quoting (with LF)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\nbar"}]]],
 "expected": ["<span title=\"foo\nbar\">"]
 },
 {"description": "proper attribute value quoting (with CR)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\rbar"}]]],
 "expected": ["<span title=\"foo\rbar\">"]
 },
 {"description": "proper attribute value non-quoting (with linetab)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Bbar"}]]],
 "expected": ["<span title=foo\u000Bbar>"],
 "xhtml": ["<span title=\"foo\u000Bbar\">"]
 },
 {"description": "proper attribute value quoting (with form feed)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "foo\u000Cbar"}]]],
 "expected": ["<span title=\"foo\u000Cbar\">"]
 },
 {"description": "void element (as EmptyTag token)",
 "input": [["EmptyTag", "img", {}]],
 "expected": ["<img>"],
 "xhtml":    ["<img />"]
 },
 {"description": "void element (as StartTag token)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "img", {}]],
 "expected": ["<img>"],
 "xhtml":    ["<img />"]
 },
 {"description": "doctype in error",
 "input": [["Doctype", "foo"]],
 "expected": ["<!DOCTYPE foo>"]
 },
 {"description": "character data",
 "options": {"encoding":"utf-8"},
 "input": [["Characters", "a<b>c&d"]],
 "expected": ["a&lt;b&gt;c&amp;d"]
 },
 {"description": "rcdata",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
 "expected": ["<script>a<b>c&d"],
 "xhtml": ["<script>a&lt;b&gt;c&amp;d"]
 },
 {"description": "doctype",
 "input": [["Doctype", "HTML"]],
 "expected": ["<!DOCTYPE HTML>"]
 },
 {"description": "HTML 4.01 DOCTYPE",
 "input": [["Doctype", "HTML",  "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"]],
 "expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">"]
 },
 {"description": "HTML 4.01 DOCTYPE without system identifer",
 "input": [["Doctype", "HTML",  "-//W3C//DTD HTML 4.01//EN"]],
 "expected": ["<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\">"]
 },
 {"description": "IBM DOCTYPE without public identifer",
 "input": [["Doctype", "html",  "", "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"]],
 "expected": ["<!DOCTYPE html SYSTEM \"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd\">"]
 }
 ]}
--- a/lib/html5lib/tests/testdata/serializer/injectmeta.test
+++ b/lib/html5lib/tests/testdata/serializer/injectmeta.test
@ -1,66 +0,0 @@
 {"tests": [
 {"description": "no encoding",
 "options": {"inject_meta_charset": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": [""],
 "xhtml": ["<head></head>"]
 },
 {"description": "empytag head",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta charset=utf-8>"],
 "xhtml":    ["<head><meta charset=\"utf-8\" /></head>"]
 },
 {"description": "head w/title",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml","title",{}], ["Characters", "foo"],["EndTag", "http://www.w3.org/1999/xhtml", "title"], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta charset=utf-8><title>foo</title>"],
 "xhtml":    ["<head><meta charset=\"utf-8\" /><title>foo</title></head>"]
 },
 {"description": "head w/meta-charset",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta charset=utf-8>"],
 "xhtml":    ["<head><meta charset=\"utf-8\" /></head>"]
 },
 {"description": "head w/ two meta-charset",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta charset=utf-8><meta charset=utf-8>", "<head><meta charset=utf-8><meta charset=ascii>"],
 "xhtml": ["<head><meta charset=\"utf-8\" /><meta charset=\"utf-8\" /></head>", "<head><meta charset=\"utf-8\" /><meta charset=\"ascii\" /></head>"]
 },
 {"description": "head w/robots",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta charset=utf-8><meta content=noindex name=robots>"],
 "xhtml":    ["<head><meta charset=\"utf-8\" /><meta content=\"noindex\" name=\"robots\" /></head>"]
 },
 {"description": "head w/robots & charset",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "charset", "value": "ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta content=noindex name=robots><meta charset=utf-8>"],
 "xhtml":    ["<head><meta content=\"noindex\" name=\"robots\" /><meta charset=\"utf-8\" /></head>"]
 },
 {"description": "head w/ charset in http-equiv content-type",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
 "xhtml":    ["<head><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
 },
 {"description": "head w/robots & charset in http-equiv content-type",
 "options": {"inject_meta_charset": true, "encoding":"utf-8"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag","meta",[{"namespace": null, "name": "name", "value": "robots"},{"namespace": null, "name": "content", "value": "noindex"}]], ["EmptyTag","meta",[{"namespace": null, "name": "http-equiv", "value": "content-type"}, {"namespace": null, "name": "content", "value": "text/html; charset=ascii"}]], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": ["<meta content=noindex name=robots><meta content=\"text/html; charset=utf-8\" http-equiv=content-type>"],
 "xhtml": ["<head><meta content=\"noindex\" name=\"robots\" /><meta content=\"text/html; charset=utf-8\" http-equiv=\"content-type\" /></head>"]
 }
 ]}
--- a/lib/html5lib/tests/testdata/serializer/optionaltags.test
+++ b/lib/html5lib/tests/testdata/serializer/optionaltags.test
@ -1,965 +0,0 @@
 {"tests": [
 {"description": "html start-tag followed by text, with attributes",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", [{"namespace": null, "name": "lang", "value": "en"}]], ["Characters", "foo"]],
 "expected": ["<html lang=en>foo"]
 },
 {"description": "html start-tag followed by comment",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Comment", "foo"]],
 "expected": ["<html><!--foo-->"]
 },
 {"description": "html start-tag followed by space character",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", " foo"]],
 "expected": ["<html> foo"]
 },
 {"description": "html start-tag followed by text",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "html start-tag followed by start-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "html start-tag followed by end-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "html start-tag at EOF (shouldn't ever happen?!)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "html", {}]],
 "expected": [""]
 },
 {"description": "html end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Comment", "foo"]],
 "expected": ["</html><!--foo-->"]
 },
 {"description": "html end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", " foo"]],
 "expected": ["</html> foo"]
 },
 {"description": "html end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "html end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "html end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "html end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "html"]],
 "expected": [""]
 },
 {"description": "head start-tag followed by comment",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Comment", "foo"]],
 "expected": ["<head><!--foo-->"]
 },
 {"description": "head start-tag followed by space character",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", " foo"]],
 "expected": ["<head> foo"]
 },
 {"description": "head start-tag followed by text",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["Characters", "foo"]],
 "expected": ["<head>foo"]
 },
 {"description": "head start-tag followed by start-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "head start-tag followed by end-tag (shouldn't ever happen?!)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["<head></foo>", "</foo>"]
 },
 {"description": "empty head element",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": [""]
 },
 {"description": "head start-tag followed by empty-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}], ["EmptyTag", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "head start-tag at EOF (shouldn't ever happen?!)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "head", {}]],
 "expected": ["<head>", ""]
 },
 {"description": "head end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Comment", "foo"]],
 "expected": ["</head><!--foo-->"]
 },
 {"description": "head end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", " foo"]],
 "expected": ["</head> foo"]
 },
 {"description": "head end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "head end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "head end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "head end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "head"]],
 "expected": [""]
 },
 {"description": "body start-tag followed by comment",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Comment", "foo"]],
 "expected": ["<body><!--foo-->"]
 },
 {"description": "body start-tag followed by space character",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", " foo"]],
 "expected": ["<body> foo"]
 },
 {"description": "body start-tag followed by text",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "body start-tag followed by start-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "body start-tag followed by end-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "body start-tag at EOF (shouldn't ever happen?!)",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "body", {}]],
 "expected": [""]
 },
 {"description": "body end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Comment", "foo"]],
 "expected": ["</body><!--foo-->"]
 },
 {"description": "body end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", " foo"]],
 "expected": ["</body> foo"]
 },
 {"description": "body end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "body end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "body end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "body end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "body"]],
 "expected": [""]
 },
 {"description": "li end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Comment", "foo"]],
 "expected": ["</li><!--foo-->"]
 },
 {"description": "li end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", " foo"]],
 "expected": ["</li> foo"]
 },
 {"description": "li end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["Characters", "foo"]],
 "expected": ["</li>foo"]
 },
 {"description": "li end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</li><foo>"]
 },
 {"description": "li end-tag followed by li start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["StartTag", "http://www.w3.org/1999/xhtml", "li", {}]],
 "expected": ["<li>"]
 },
 {"description": "li end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "li end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "li"]],
 "expected": [""]
 },
 {"description": "dt end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Comment", "foo"]],
 "expected": ["</dt><!--foo-->"]
 },
 {"description": "dt end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", " foo"]],
 "expected": ["</dt> foo"]
 },
 {"description": "dt end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["Characters", "foo"]],
 "expected": ["</dt>foo"]
 },
 {"description": "dt end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</dt><foo>"]
 },
 {"description": "dt end-tag followed by dt start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
 "expected": ["<dt>"]
 },
 {"description": "dt end-tag followed by dd start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
 "expected": ["<dd>"]
 },
 {"description": "dt end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</dt></foo>"]
 },
 {"description": "dt end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dt"]],
 "expected": ["</dt>"]
 },
 {"description": "dd end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Comment", "foo"]],
 "expected": ["</dd><!--foo-->"]
 },
 {"description": "dd end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", " foo"]],
 "expected": ["</dd> foo"]
 },
 {"description": "dd end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["Characters", "foo"]],
 "expected": ["</dd>foo"]
 },
 {"description": "dd end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</dd><foo>"]
 },
 {"description": "dd end-tag followed by dd start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dd", {}]],
 "expected": ["<dd>"]
 },
 {"description": "dd end-tag followed by dt start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["StartTag", "http://www.w3.org/1999/xhtml", "dt", {}]],
 "expected": ["<dt>"]
 },
 {"description": "dd end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "dd end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "dd"]],
 "expected": [""]
 },
 {"description": "p end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Comment", "foo"]],
 "expected": ["</p><!--foo-->"]
 },
 {"description": "p end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", " foo"]],
 "expected": ["</p> foo"]
 },
 {"description": "p end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["Characters", "foo"]],
 "expected": ["</p>foo"]
 },
 {"description": "p end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</p><foo>"]
 },
 {"description": "p end-tag followed by address start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "address", {}]],
 "expected": ["<address>"]
 },
 {"description": "p end-tag followed by article start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "article", {}]],
 "expected": ["<article>"]
 },
 {"description": "p end-tag followed by aside start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "aside", {}]],
 "expected": ["<aside>"]
 },
 {"description": "p end-tag followed by blockquote start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "blockquote", {}]],
 "expected": ["<blockquote>"]
 },
 {"description": "p end-tag followed by datagrid start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "datagrid", {}]],
 "expected": ["<datagrid>"]
 },
 {"description": "p end-tag followed by dialog start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dialog", {}]],
 "expected": ["<dialog>"]
 },
 {"description": "p end-tag followed by dir start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dir", {}]],
 "expected": ["<dir>"]
 },
 {"description": "p end-tag followed by div start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
 "expected": ["<div>"]
 },
 {"description": "p end-tag followed by dl start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "dl", {}]],
 "expected": ["<dl>"]
 },
 {"description": "p end-tag followed by fieldset start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "fieldset", {}]],
 "expected": ["<fieldset>"]
 },
 {"description": "p end-tag followed by footer start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "footer", {}]],
 "expected": ["<footer>"]
 },
 {"description": "p end-tag followed by form start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "form", {}]],
 "expected": ["<form>"]
 },
 {"description": "p end-tag followed by h1 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h1", {}]],
 "expected": ["<h1>"]
 },
 {"description": "p end-tag followed by h2 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h2", {}]],
 "expected": ["<h2>"]
 },
 {"description": "p end-tag followed by h3 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h3", {}]],
 "expected": ["<h3>"]
 },
 {"description": "p end-tag followed by h4 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h4", {}]],
 "expected": ["<h4>"]
 },
 {"description": "p end-tag followed by h5 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h5", {}]],
 "expected": ["<h5>"]
 },
 {"description": "p end-tag followed by h6 start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "h6", {}]],
 "expected": ["<h6>"]
 },
 {"description": "p end-tag followed by header start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "header", {}]],
 "expected": ["<header>"]
 },
 {"description": "p end-tag followed by hr empty-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EmptyTag", "hr", {}]],
 "expected": ["<hr>"]
 },
 {"description": "p end-tag followed by menu start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "menu", {}]],
 "expected": ["<menu>"]
 },
 {"description": "p end-tag followed by nav start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "nav", {}]],
 "expected": ["<nav>"]
 },
 {"description": "p end-tag followed by ol start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ol", {}]],
 "expected": ["<ol>"]
 },
 {"description": "p end-tag followed by p start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "p", {}]],
 "expected": ["<p>"]
 },
 {"description": "p end-tag followed by pre start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}]],
 "expected": ["<pre>"]
 },
 {"description": "p end-tag followed by section start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "section", {}]],
 "expected": ["<section>"]
 },
 {"description": "p end-tag followed by table start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "table", {}]],
 "expected": ["<table>"]
 },
 {"description": "p end-tag followed by ul start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["StartTag", "http://www.w3.org/1999/xhtml", "ul", {}]],
 "expected": ["<ul>"]
 },
 {"description": "p end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "p end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "p"]],
 "expected": [""]
 },
 {"description": "optgroup end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Comment", "foo"]],
 "expected": ["</optgroup><!--foo-->"]
 },
 {"description": "optgroup end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", " foo"]],
 "expected": ["</optgroup> foo"]
 },
 {"description": "optgroup end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["Characters", "foo"]],
 "expected": ["</optgroup>foo"]
 },
 {"description": "optgroup end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</optgroup><foo>"]
 },
 {"description": "optgroup end-tag followed by optgroup start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
 "expected": ["<optgroup>"]
 },
 {"description": "optgroup end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "optgroup end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "optgroup"]],
 "expected": [""]
 },
 {"description": "option end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Comment", "foo"]],
 "expected": ["</option><!--foo-->"]
 },
 {"description": "option end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", " foo"]],
 "expected": ["</option> foo"]
 },
 {"description": "option end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["Characters", "foo"]],
 "expected": ["</option>foo"]
 },
 {"description": "option end-tag followed by optgroup start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "optgroup", {}]],
 "expected": ["<optgroup>"]
 },
 {"description": "option end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</option><foo>"]
 },
 {"description": "option end-tag followed by option start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["StartTag", "http://www.w3.org/1999/xhtml", "option", {}]],
 "expected": ["<option>"]
 },
 {"description": "option end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "option end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "option"]],
 "expected": [""]
 },
 {"description": "colgroup start-tag followed by comment",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Comment", "foo"]],
 "expected": ["<colgroup><!--foo-->"]
 },
 {"description": "colgroup start-tag followed by space character",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", " foo"]],
 "expected": ["<colgroup> foo"]
 },
 {"description": "colgroup start-tag followed by text",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["Characters", "foo"]],
 "expected": ["<colgroup>foo"]
 },
 {"description": "colgroup start-tag followed by start-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<colgroup><foo>"]
 },
 {"description": "first colgroup in a table with a col child",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EmptyTag", "col", {}]],
 "expected": ["<table><col>"]
 },
 {"description": "colgroup with a col child, following another colgroup",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "col", {}]],
 "expected": ["</colgroup><col>", "<colgroup><col>"]
 },
 {"description": "colgroup start-tag followed by end-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["<colgroup></foo>"]
 },
 {"description": "colgroup start-tag at EOF",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "colgroup", {}]],
 "expected": ["<colgroup>"]
 },
 {"description": "colgroup end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Comment", "foo"]],
 "expected": ["</colgroup><!--foo-->"]
 },
 {"description": "colgroup end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", " foo"]],
 "expected": ["</colgroup> foo"]
 },
 {"description": "colgroup end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["Characters", "foo"]],
 "expected": ["foo"]
 },
 {"description": "colgroup end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<foo>"]
 },
 {"description": "colgroup end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "colgroup end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "colgroup"]],
 "expected": [""]
 },
 {"description": "thead end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Comment", "foo"]],
 "expected": ["</thead><!--foo-->"]
 },
 {"description": "thead end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", " foo"]],
 "expected": ["</thead> foo"]
 },
 {"description": "thead end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["Characters", "foo"]],
 "expected": ["</thead>foo"]
 },
 {"description": "thead end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</thead><foo>"]
 },
 {"description": "thead end-tag followed by tbody start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
 "expected": ["<tbody>"]
 },
 {"description": "thead end-tag followed by tfoot start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
 "expected": ["<tfoot>"]
 },
 {"description": "thead end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</thead></foo>"]
 },
 {"description": "thead end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"]],
 "expected": ["</thead>"]
 },
 {"description": "tbody start-tag followed by comment",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Comment", "foo"]],
 "expected": ["<tbody><!--foo-->"]
 },
 {"description": "tbody start-tag followed by space character",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", " foo"]],
 "expected": ["<tbody> foo"]
 },
 {"description": "tbody start-tag followed by text",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["Characters", "foo"]],
 "expected": ["<tbody>foo"]
 },
 {"description": "tbody start-tag followed by start-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["<tbody><foo>"]
 },
 {"description": "first tbody in a table with a tr child",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "table", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
 "expected": ["<table><tr>"]
 },
 {"description": "tbody with a tr child, following another tbody",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
 "expected": ["<tbody><tr>", "</tbody><tr>"]
 },
 {"description": "tbody with a tr child, following a thead",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "thead"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
 "expected": ["<tbody><tr>", "</thead><tr>"]
 },
 {"description": "tbody with a tr child, following a tfoot",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
 "expected": ["<tbody><tr>", "</tfoot><tr>"]
 },
 {"description": "tbody start-tag followed by end-tag",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["<tbody></foo>"]
 },
 {"description": "tbody start-tag at EOF",
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
 "expected": ["<tbody>"]
 },
 {"description": "tbody end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Comment", "foo"]],
 "expected": ["</tbody><!--foo-->"]
 },
 {"description": "tbody end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", " foo"]],
 "expected": ["</tbody> foo"]
 },
 {"description": "tbody end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["Characters", "foo"]],
 "expected": ["</tbody>foo"]
 },
 {"description": "tbody end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</tbody><foo>"]
 },
 {"description": "tbody end-tag followed by tbody start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
 "expected": ["<tbody>", "</tbody>"]
 },
 {"description": "tbody end-tag followed by tfoot start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["StartTag", "http://www.w3.org/1999/xhtml", "tfoot", {}]],
 "expected": ["<tfoot>"]
 },
 {"description": "tbody end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "tbody end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tbody"]],
 "expected": [""]
 },
 {"description": "tfoot end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Comment", "foo"]],
 "expected": ["</tfoot><!--foo-->"]
 },
 {"description": "tfoot end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", " foo"]],
 "expected": ["</tfoot> foo"]
 },
 {"description": "tfoot end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["Characters", "foo"]],
 "expected": ["</tfoot>foo"]
 },
 {"description": "tfoot end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</tfoot><foo>"]
 },
 {"description": "tfoot end-tag followed by tbody start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["StartTag", "http://www.w3.org/1999/xhtml", "tbody", {}]],
 "expected": ["<tbody>", "</tfoot>"]
 },
 {"description": "tfoot end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "tfoot end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tfoot"]],
 "expected": [""]
 },
 {"description": "tr end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Comment", "foo"]],
 "expected": ["</tr><!--foo-->"]
 },
 {"description": "tr end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", " foo"]],
 "expected": ["</tr> foo"]
 },
 {"description": "tr end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["Characters", "foo"]],
 "expected": ["</tr>foo"]
 },
 {"description": "tr end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</tr><foo>"]
 },
 {"description": "tr end-tag followed by tr start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["StartTag", "http://www.w3.org/1999/xhtml", "tr", {}]],
 "expected": ["<tr>", "</tr>"]
 },
 {"description": "tr end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "tr end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "tr"]],
 "expected": [""]
 },
 {"description": "td end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Comment", "foo"]],
 "expected": ["</td><!--foo-->"]
 },
 {"description": "td end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", " foo"]],
 "expected": ["</td> foo"]
 },
 {"description": "td end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["Characters", "foo"]],
 "expected": ["</td>foo"]
 },
 {"description": "td end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</td><foo>"]
 },
 {"description": "td end-tag followed by td start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
 "expected": ["<td>", "</td>"]
 },
 {"description": "td end-tag followed by th start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
 "expected": ["<th>", "</td>"]
 },
 {"description": "td end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "td end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "td"]],
 "expected": [""]
 },
 {"description": "th end-tag followed by comment",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Comment", "foo"]],
 "expected": ["</th><!--foo-->"]
 },
 {"description": "th end-tag followed by space character",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", " foo"]],
 "expected": ["</th> foo"]
 },
 {"description": "th end-tag followed by text",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["Characters", "foo"]],
 "expected": ["</th>foo"]
 },
 {"description": "th end-tag followed by start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "foo", {}]],
 "expected": ["</th><foo>"]
 },
 {"description": "th end-tag followed by th start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "th", {}]],
 "expected": ["<th>", "</th>"]
 },
 {"description": "th end-tag followed by td start-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["StartTag", "http://www.w3.org/1999/xhtml", "td", {}]],
 "expected": ["<td>", "</th>"]
 },
 {"description": "th end-tag followed by end-tag",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml", "th"], ["EndTag", "http://www.w3.org/1999/xhtml", "foo"]],
 "expected": ["</foo>"]
 },
 {"description": "th end-tag at EOF",
 "input": [["EndTag", "http://www.w3.org/1999/xhtml"    , "th"]],
 "expected": [""]
 }
 ]}
--- a/lib/html5lib/tests/testdata/serializer/options.test
+++ b/lib/html5lib/tests/testdata/serializer/options.test
@ -1,60 +0,0 @@
 {"tests":[
 {"description": "quote_char=\"'\"",
 "options": {"quote_char": "'"},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "span", [{"namespace": null, "name": "title", "value": "test 'with' quote_char"}]]],
 "expected": ["<span title='test &#39;with&#39; quote_char'>"]
 },
 {"description": "quote_attr_values=true",
 "options": {"quote_attr_values": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "button", [{"namespace": null, "name": "disabled", "value" :"disabled"}]]],
 "expected": ["<button disabled>"],
 "xhtml":    ["<button disabled=\"disabled\">"]
 },
 {"description": "quote_attr_values=true with irrelevant",
 "options": {"quote_attr_values": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
 "expected": ["<div irrelevant>"],
 "xhtml":    ["<div irrelevant=\"irrelevant\">"]
 },
 {"description": "use_trailing_solidus=true with void element",
 "options": {"use_trailing_solidus": true},
 "input": [["EmptyTag", "img", {}]],
 "expected": ["<img />"]
 },
 {"description": "use_trailing_solidus=true with non-void element",
 "options": {"use_trailing_solidus": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", {}]],
 "expected": ["<div>"]
 },
 {"description": "minimize_boolean_attributes=false",
 "options": {"minimize_boolean_attributes": false},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :"irrelevant"}]]],
 "expected": ["<div irrelevant=irrelevant>"],
 "xhtml":    ["<div irrelevant=\"irrelevant\">"]
 },
 {"description": "minimize_boolean_attributes=false with empty value",
 "options": {"minimize_boolean_attributes": false},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "div", [{"namespace": null, "name": "irrelevant", "value" :""}]]],
 "expected": ["<div irrelevant=\"\">"]
 },
 {"description": "escape less than signs in attribute values",
 "options": {"escape_lt_in_attrs": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "a", [{"namespace": null, "name": "title", "value": "a<b>c&d"}]]],
 "expected": ["<a title=\"a&lt;b>c&amp;d\">"]
 },
 {"description": "rcdata",
 "options": {"escape_rcdata": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "a<b>c&d"]],
 "expected": ["<script>a&lt;b&gt;c&amp;d"]
 }
 ]}
--- a/lib/html5lib/tests/testdata/serializer/whitespace.test
+++ b/lib/html5lib/tests/testdata/serializer/whitespace.test
@ -1,51 +0,0 @@
 {"tests": [
 {"description": "bare text with leading spaces",
 "options": {"strip_whitespace": true},
 "input": [["Characters", "\t\r\n\u000C foo"]],
 "expected": [" foo"]
 },
 {"description": "bare text with trailing spaces",
 "options": {"strip_whitespace": true},
 "input": [["Characters", "foo \t\r\n\u000C"]],
 "expected": ["foo "]
 },
 {"description": "bare text with inner spaces",
 "options": {"strip_whitespace": true},
 "input": [["Characters", "foo \t\r\n\u000C bar"]],
 "expected": ["foo bar"]
 },
 {"description": "text within <pre>",
 "options": {"strip_whitespace": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
 "expected": ["<pre>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</pre>"]
 },
 {"description": "text within <pre>, with inner markup",
 "options": {"strip_whitespace": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "pre", {}], ["Characters", "\t\r\n\u000C fo"], ["StartTag", "http://www.w3.org/1999/xhtml", "span", {}], ["Characters", "o \t\r\n\u000C b"], ["EndTag", "http://www.w3.org/1999/xhtml", "span"], ["Characters", "ar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "pre"]],
 "expected": ["<pre>\t\r\n\u000C fo<span>o \t\r\n\u000C b</span>ar \t\r\n\u000C</pre>"]
 },
 {"description": "text within <textarea>",
 "options": {"strip_whitespace": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "textarea", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "textarea"]],
 "expected": ["<textarea>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</textarea>"]
 },
 {"description": "text within <script>",
 "options": {"strip_whitespace": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "script", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "script"]],
 "expected": ["<script>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</script>"]
 },
 {"description": "text within <style>",
 "options": {"strip_whitespace": true},
 "input": [["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C"], ["EndTag", "http://www.w3.org/1999/xhtml", "style"]],
 "expected": ["<style>\t\r\n\u000C foo \t\r\n\u000C bar \t\r\n\u000C</style>"]
 }
 ]}
--- a/lib/html5lib/tests/testdata/sniffer/htmlOrFeed.json
+++ b/lib/html5lib/tests/testdata/sniffer/htmlOrFeed.json
@ -1,43 +0,0 @@
 [
    {"type": "text/html", "input": ""},
    {"type": "text/html", "input": "<!---->"},
    {"type": "text/html", "input": "<!--asdfaslkjdf;laksjdf as;dkfjsd-->"},
    {"type": "text/html", "input": "<!"},
    {"type": "text/html", "input": "\t"},
    {"type": "text/html", "input": "<!>"},
    {"type": "text/html", "input": "<?"},
    {"type": "text/html", "input": "<??>"},
    {"type": "application/rss+xml", "input": "<rss"},
    {"type": "application/atom+xml", "input": "<feed"},
    {"type": "text/html", "input": "<html"},
    {"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<html><head>\n<title>302 Found</title>\n</head><body>\n<h1>Found</h1>\n<p>The document has moved <a href=\"http://feeds.feedburner.com/gofug\">here</a>.</p>\n</body></html>\n"},
    {"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\r\n<HTML><HEAD>\r\n   <link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/289619328/feed.css\" /><link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/431602649/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/382549546/feed.css\" />\r\n<link rel=\"stylesheet\" type=\"text/css\" href=\"http://cache.blogads.com/314618017/feed.css\" /><META http-equiv=\"expires\" content="},
    {"type": "text/html", "input": "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\r\n<html>\r\n<head>\r\n<title>Xiaxue - Chicken pie blogger.</title><meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\"><style type=\"text/css\">\r\n<style type=\"text/css\">\r\n<!--\r\nbody {\r\n background-color: #FFF2F2;\r\n}\r\n.style1 {font-family: Georgia, \"Times New Roman\", Times, serif}\r\n.style2 {\r\n color: #8a567c;\r\n font-size: 14px;\r\n font-family: Georgia, \"Times New Roman\", Times, serif;\r\n}\r"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head> \r\n<title>Google Operating System</title>\r\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"Description\" content=\"Unofficial news and tips about Google. A blog that watches Google's latest developments and the attempts to move your operating system online.\" />\r\n<meta name=\"generator\" c"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n  <title>Assimilated Press</title>  <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Assimilated Press - Atom\" href=\"http://assimila"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<head>\r\n  <title>PostSecret</title>\r\n<META name=\"keywords\" Content=\"secrets, postcard, secret, postcards, postsecret, postsecrets,online confessional, post secret, post secrets, artomatic, post a secret\"><META name=\"discription\" Content=\"See a Secret...Share a Secret\">  <meta http-equiv=\"Content-Type\" content=\"te"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns='http://www.w3.org/1999/xhtml' xmlns:b='http://www.google.com/2005/gml/b' xmlns:data='http://www.google.com/2005/gml/data' xmlns:expr='http://www.google.com/2005/gml/expr'>\n  <head>\n    \n  <meta content='text/html; charset=UTF-8' http-equiv='Content-Type'/>\n  <meta content='true' name='MSSmartTagsPreventParsing'/>\n  <meta content='blogger' name='generator'/>\n  <link rel=\"alternate\" typ"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\">\n<head profile=\"http://gmpg.org/xfn/11\"> \n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />  \n<title> CMS Lever</title><link rel=\"stylesheet\" type=\"text/css\" media=\"screen\" href=\"http://s.wordpress.com/wp-content/themes/pub/twenty-eight/2813.css\"/>\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" h"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> Park Avenue Peerage</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://parkavenuepeerage.wordpress.com/feed/\" />\t<link rel=\"pingback\" href="},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"ja\"><head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n<title> \u884c\u96f2\u6d41\u6c34 -like a floating clouds and running water-</title>\t<meta name=\"generator\" content=\"WordPress.com\" />\t<!-- feeds -->\n\t<link rel=\"alternate\" type=\"application/rss+xml\" title=\"RSS 2.0\" href=\"http://shw4.wordpress.com/feed/\" />\t<li"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Go Fug Yourself</title><link rel=\"stylesheet\" href=\"http://gofugyourself.typepad.com/go_fug_yourself/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Atom\" "},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\"><head profile=\"http://gmpg.org/xfn/11\">\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" /><title> Ladies&#8230;</title><meta name=\"generator\" content=\"WordPress.com\" /> <!-- leave this for stats --><link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/default/style.css?1\" type=\"tex"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\r\n<html xmlns=\"http://www.w3.org/1999/xhtml\">\r\n<head>\r\n  <title>The Sartorialist</title>  <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"The Sartorialist - Atom\" href=\"http://thesartorialist.blogspot"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n     \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html  xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\" />\n<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n<title>Creating Passionate Users</title><link rel=\"stylesheet\" href=\"http://headrush.typepad.com/creating_passionate_users/styles.css\" type=\"text/css\" />\n<link rel=\"alternate\" type"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n    <meta name=\"keywords\" content=\"marketing, blog, seth, ideas, respect, permission\" />\n    <meta name=\"description\" content=\"Seth Godin's riffs on marketing, respect, and the "},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" id=\"sixapart-standard\">\n<head>\n\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />\n\t<meta name=\"generator\" content=\"http://www.typepad.com/\" />\n\t\n\t\n    \n    <meta name=\"description\" content=\" Western Civilization hangs in the balance. This blog is part of the solution,the cure. Get your heads out of the sand and Fight the G"},
    {"type": "text/html", "input": "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n<html xmlns=\"http://www.w3.org/1999/xhtml\" dir=\"ltr\" lang=\"en\">\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\" />\n<title> From Under the Rotunda</title>\n<link rel=\"stylesheet\" href=\"http://s.wordpress.com/wp-content/themes/pub/andreas04/style.css\" type=\"text/css\""},
    {"type": "application/atom+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href=\"http://www.blogger.com/styles/atom.css\" type=\"text/css\"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/'><id>tag:blogger.com,1999:blog-10861780</id><updated>2007-07-27T12:38:50.888-07:00</updated><title type='text'>Official Google Blog</title><link rel='alternate' type='text/html' href='http://googleblog.blogspot.com/'/><link rel='next' type='application/atom+xml' href='http://googleblog.blogs"},
    {"type": "application/rss+xml", "input": "<?xml version='1.0' encoding='UTF-8'?><rss xmlns:atom='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' version='2.0'><channel><atom:id>tag:blogger.com,1999:blog-10861780</atom:id><lastBuildDate>Fri, 27 Jul 2007 19:38:50 +0000</lastBuildDate><title>Official Google Blog</title><description/><link>http://googleblog.blogspot.com/</link><managingEditor>Eric Case</managingEditor><generator>Blogger</generator><openSearch:totalResults>729</openSearch:totalResults><openSearc"},
    {"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"pahrefhttpwwwfeedburnercomtarget_blankimgsrchttpwwwfeedburnercomfbimagespubpowered_by_fbgifaltPoweredbyFeedBurnerstyleborder0ap\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>From Under the Rotunda</title>\n\t<link>http://dannybernardi.wordpress.com</link>\n\t<description>The Monographs of Danny Ber"},
    {"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>CMS Lever</title>\n\t<link>http://kanaguri.wordpress.com</link>\n\t<description>CMS\u306e\u6c17\u306b\u306a\u3063\u305f\u3053\u3068</description>\n\t<pubDate>Wed, 18 Jul 2007 21:26:22 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>ja</languag"},
    {"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\">\n    <title>Atlas Shrugs</title>\n    <link rel=\"self\" type=\"application/atom+xml\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/atom.xml\" />\n    <link rel=\"alternate\" type=\"text/html\" href=\"http://atlasshrugs2000.typepad.com/atlas_shrugs/\" />\n    <id>tag:typepad.com,2003:weblog-132946</id>\n    <updated>2007-08-15T16:07:34-04"},
    {"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:thr=\"http://purl.org/syndication/thread/1.0\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n    <title>Creating Passionate Users</title>\r\n  "},
    {"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\">\r\n    <title>Seth's Blog</title>\r\n    <link rel=\"alternate\" type=\"text/html\" href=\"http://sethgodin.typepad.com/seths_blog/\" />\r\n    <link rel=\"s"},
    {"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atom10full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://www.w3.org/2005/Atom\" xmlns:openSearch=\"http://a9.com/-/spec/opensearchrss/1.0/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\"><id>tag:blogger.com,1999:blog-32454861</id><updated>2007-07-31T21:44:09.867+02:00</upd"},
    {"type": "application/atom+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/atomfull.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><feed xmlns=\"http://purl.org/atom/ns#\" xmlns:dc=\"http://purl.org/dc/elements/1.1/\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"0.3\">\r\n  <title>Go Fug Yourself</title>\r\n  <link rel=\"alternate\" type=\"text/html\" href=\"http://go"},
    {"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n<?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/rss2full.xsl\" type=\"text/xsl\" media=\"screen\"?><?xml-stylesheet href=\"http://feeds.feedburner.com/~d/styles/itemcontent.css\" type=\"text/css\" media=\"screen\"?><rss xmlns:creativeCommons=\"http://backend.userland.com/creativeCommonsRssModule\" xmlns:feedburner=\"http://rssnamespace.org/feedburner/ext/1.0\" version=\"2.0\"><channel><title>Google Operating System</title><link>http://googlesystem.blogspot.com/</link>"},
    {"type": "application/rss+xml", "input": "<?xml version=\"1.0\" encoding=\"\"?>\n<!-- generator=\"wordpress/MU\" -->\n<rss version=\"2.0\"\n\txmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n\txmlns:wfw=\"http://wellformedweb.org/CommentAPI/\"\n\txmlns:dc=\"http://purl.org/dc/elements/1.1/\"\n\t><channel>\n\t<title>Nunublog</title>\n\t<link>http://nunubh.wordpress.com</link>\n\t<description>Just Newbie Blog!</description>\n\t<pubDate>Mon, 09 Jul 2007 18:54:09 +0000</pubDate>\n\t<generator>http://wordpress.org/?v=MU</generator>\n\t<language>id</language>\n\t\t\t<item>\n\t\t<ti"},
    {"type": "text/html", "input": "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\r\n<HEAD>\r\n<TITLE>Design*Sponge</TITLE><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\r\n<meta name=\"MSSmartTagsPreventParsing\" content=\"true\" />\r\n<meta name=\"generator\" content=\"Blogger\" />\r\n<link rel=\"alternate\" type=\"application/atom+xml\" title=\"Design*Sponge - Atom\" href=\"http://designsponge.blogspot.com/feeds/posts/default\" />\r\n<link rel=\"alternate\" type=\"application/rss+xml\" title=\"Design*Sponge - RSS\" href="},
    {"type": "text/html", "input": "<HTML>\n<HEAD>\n<TITLE>Moved Temporarily</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"#FFFFFF\" TEXT=\"#000000\">\n<H1>Moved Temporarily</H1>\nThe document has moved <A HREF=\"http://feeds.feedburner.com/thesecretdiaryofstevejobs\">here</A>.\n</BODY>\n</HTML>\n"}
 ]
--- a/lib/html5lib/tests/testdata/tokenizer/contentModelFlags.test
+++ b/lib/html5lib/tests/testdata/tokenizer/contentModelFlags.test
@ -1,75 +0,0 @@
 {"tests": [
 {"description":"PLAINTEXT content model flag",
 "initialStates":["PLAINTEXT state"],
 "lastStartTag":"plaintext",
 "input":"<head>&body;",
 "output":[["Character", "<head>&body;"]]},
 {"description":"End tag closing RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp>",
 "output":[["Character", "foo"], ["EndTag", "xmp"]]},
 {"description":"End tag closing RCDATA or RAWTEXT (case-insensitivity)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xMp>",
 "output":[["Character", "foo"], ["EndTag", "xmp"]]},
 {"description":"End tag closing RCDATA or RAWTEXT (ending with space)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp ",
 "output":[["Character", "foo"], "ParseError"]},
 {"description":"End tag closing RCDATA or RAWTEXT (ending with EOF)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp",
 "output":[["Character", "foo</xmp"]]},
 {"description":"End tag closing RCDATA or RAWTEXT (ending with slash)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp/",
 "output":[["Character", "foo"], "ParseError"]},
 {"description":"End tag not closing RCDATA or RAWTEXT (ending with left-angle-bracket)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp<",
 "output":[["Character", "foo</xmp<"]]},
 {"description":"End tag with incorrect name in RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"</foo>bar</xmp>",
 "output":[["Character", "</foo>bar"], ["EndTag", "xmp"]]},
 {"description":"End tag with incorrect name in RCDATA or RAWTEXT (starting like correct name)",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"</foo>bar</xmpaar>",
 "output":[["Character", "</foo>bar</xmpaar>"]]},
 {"description":"End tag closing RCDATA or RAWTEXT, switching back to PCDATA",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo</xmp></baz>",
 "output":[["Character", "foo"], ["EndTag", "xmp"], ["EndTag", "baz"]]},
 {"description":"RAWTEXT w/ something looking like an entity",
 "initialStates":["RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"&foo;",
 "output":[["Character", "&foo;"]]},
 {"description":"RCDATA w/ an entity",
 "initialStates":["RCDATA state"],
 "lastStartTag":"textarea",
 "input":"&lt;",
 "output":[["Character", "<"]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/domjs.test
+++ b/lib/html5lib/tests/testdata/tokenizer/domjs.test
@ -1,90 +0,0 @@
 {
    "tests": [
        {
            "description":"CR in bogus comment state",
            "input":"<?\u000d",
            "output":["ParseError", ["Comment", "?\u000a"]]
        },
        {
            "description":"CRLF in bogus comment state",
            "input":"<?\u000d\u000a",
            "output":["ParseError", ["Comment", "?\u000a"]]
        },
        {
            "description":"NUL in RCDATA and RAWTEXT",
            "doubleEscaped":true,
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "input":"\\u0000",
            "output":["ParseError", ["Character", "\\uFFFD"]]
        },
        {
            "description":"skip first BOM but not later ones",
            "input":"\uFEFFfoo\uFEFFbar",
            "output":[["Character", "foo\uFEFFbar"]]
        },
        {
            "description":"Non BMP-charref in in RCDATA",
            "initialStates":["RCDATA state"],
            "input":"&NotEqualTilde;",
            "output":[["Character", "\u2242\u0338"]]
        },
        {
            "description":"Bad charref in in RCDATA",
            "initialStates":["RCDATA state"],
            "input":"&NotEqualTild;",
            "output":["ParseError", ["Character", "&NotEqualTild;"]]
        },
        {
            "description":"lowercase endtags in RCDATA and RAWTEXT",
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "lastStartTag":"xmp",
            "input":"</XMP>",
            "output":[["EndTag","xmp"]]
        },
        {
            "description":"bad endtag in RCDATA and RAWTEXT",
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "lastStartTag":"xmp",
            "input":"</ XMP>",
            "output":[["Character","</ XMP>"]]
        },
        {
            "description":"bad endtag in RCDATA and RAWTEXT",
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "lastStartTag":"xmp",
            "input":"</xm>",
            "output":[["Character","</xm>"]]
        },
        {
            "description":"bad endtag in RCDATA and RAWTEXT",
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "lastStartTag":"xmp",
            "input":"</xm ",
            "output":[["Character","</xm "]]
        },
        {
            "description":"bad endtag in RCDATA and RAWTEXT",
            "initialStates":["RCDATA state", "RAWTEXT state"],
            "lastStartTag":"xmp",
            "input":"</xm/",
            "output":[["Character","</xm/"]]
        },
        {
            "description":"Non BMP-charref in attribute",
            "input":"<p id=\"&NotEqualTilde;\">",
            "output":[["StartTag", "p", {"id":"\u2242\u0338"}]]
        },
        {
            "description":"--!NUL in comment ",
            "doubleEscaped":true,
            "input":"<!----!\\u0000-->",
            "output":["ParseError", ["Comment", "--!\\uFFFD"]]
        },
        {
            "description":"space EOF after doctype ",
            "input":"<!DOCTYPE html ",
            "output":["ParseError", ["DOCTYPE", "html", null, null , false]]
        }
    ]
 }
--- a/lib/html5lib/tests/testdata/tokenizer/entities.test
+++ b/lib/html5lib/tests/testdata/tokenizer/entities.test
@ -1,283 +0,0 @@
 {"tests": [
 {"description": "Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.",
 "input":"<h a='&noti;'>",
 "output": ["ParseError", ["StartTag", "h", {"a": "&noti;"}]]},
 {"description": "Entity name followed by the equals sign in an attribute value.",
 "input":"<h a='&lang='>",
 "output": ["ParseError", ["StartTag", "h", {"a": "&lang="}]]},
 {"description": "CR as numeric entity",
 "input":"&#013;",
 "output": ["ParseError", ["Character", "\r"]]},
 {"description": "CR as hexadecimal numeric entity",
 "input":"&#x00D;",
 "output": ["ParseError", ["Character", "\r"]]},
 {"description": "Windows-1252 EURO SIGN numeric entity.",
 "input":"&#0128;",
 "output": ["ParseError", ["Character", "\u20AC"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
 "input":"&#0129;",
 "output": ["ParseError", ["Character", "\u0081"]]},
 {"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK numeric entity.",
 "input":"&#0130;",
 "output": ["ParseError", ["Character", "\u201A"]]},
 {"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK numeric entity.",
 "input":"&#0131;",
 "output": ["ParseError", ["Character", "\u0192"]]},
 {"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK numeric entity.",
 "input":"&#0132;",
 "output": ["ParseError", ["Character", "\u201E"]]},
 {"description": "Windows-1252 HORIZONTAL ELLIPSIS numeric entity.",
 "input":"&#0133;",
 "output": ["ParseError", ["Character", "\u2026"]]},
 {"description": "Windows-1252 DAGGER numeric entity.",
 "input":"&#0134;",
 "output": ["ParseError", ["Character", "\u2020"]]},
 {"description": "Windows-1252 DOUBLE DAGGER numeric entity.",
 "input":"&#0135;",
 "output": ["ParseError", ["Character", "\u2021"]]},
 {"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT numeric entity.",
 "input":"&#0136;",
 "output": ["ParseError", ["Character", "\u02C6"]]},
 {"description": "Windows-1252 PER MILLE SIGN numeric entity.",
 "input":"&#0137;",
 "output": ["ParseError", ["Character", "\u2030"]]},
 {"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON numeric entity.",
 "input":"&#0138;",
 "output": ["ParseError", ["Character", "\u0160"]]},
 {"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK numeric entity.",
 "input":"&#0139;",
 "output": ["ParseError", ["Character", "\u2039"]]},
 {"description": "Windows-1252 LATIN CAPITAL LIGATURE OE numeric entity.",
 "input":"&#0140;",
 "output": ["ParseError", ["Character", "\u0152"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
 "input":"&#0141;",
 "output": ["ParseError", ["Character", "\u008D"]]},
 {"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON numeric entity.",
 "input":"&#0142;",
 "output": ["ParseError", ["Character", "\u017D"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
 "input":"&#0143;",
 "output": ["ParseError", ["Character", "\u008F"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
 "input":"&#0144;",
 "output": ["ParseError", ["Character", "\u0090"]]},
 {"description": "Windows-1252 LEFT SINGLE QUOTATION MARK numeric entity.",
 "input":"&#0145;",
 "output": ["ParseError", ["Character", "\u2018"]]},
 {"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK numeric entity.",
 "input":"&#0146;",
 "output": ["ParseError", ["Character", "\u2019"]]},
 {"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK numeric entity.",
 "input":"&#0147;",
 "output": ["ParseError", ["Character", "\u201C"]]},
 {"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK numeric entity.",
 "input":"&#0148;",
 "output": ["ParseError", ["Character", "\u201D"]]},
 {"description": "Windows-1252 BULLET numeric entity.",
 "input":"&#0149;",
 "output": ["ParseError", ["Character", "\u2022"]]},
 {"description": "Windows-1252 EN DASH numeric entity.",
 "input":"&#0150;",
 "output": ["ParseError", ["Character", "\u2013"]]},
 {"description": "Windows-1252 EM DASH numeric entity.",
 "input":"&#0151;",
 "output": ["ParseError", ["Character", "\u2014"]]},
 {"description": "Windows-1252 SMALL TILDE numeric entity.",
 "input":"&#0152;",
 "output": ["ParseError", ["Character", "\u02DC"]]},
 {"description": "Windows-1252 TRADE MARK SIGN numeric entity.",
 "input":"&#0153;",
 "output": ["ParseError", ["Character", "\u2122"]]},
 {"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON numeric entity.",
 "input":"&#0154;",
 "output": ["ParseError", ["Character", "\u0161"]]},
 {"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK numeric entity.",
 "input":"&#0155;",
 "output": ["ParseError", ["Character", "\u203A"]]},
 {"description": "Windows-1252 LATIN SMALL LIGATURE OE numeric entity.",
 "input":"&#0156;",
 "output": ["ParseError", ["Character", "\u0153"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR numeric entity.",
 "input":"&#0157;",
 "output": ["ParseError", ["Character", "\u009D"]]},
 {"description": "Windows-1252 EURO SIGN hexadecimal numeric entity.",
 "input":"&#x080;",
 "output": ["ParseError", ["Character", "\u20AC"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
 "input":"&#x081;",
 "output": ["ParseError", ["Character", "\u0081"]]},
 {"description": "Windows-1252 SINGLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x082;",
 "output": ["ParseError", ["Character", "\u201A"]]},
 {"description": "Windows-1252 LATIN SMALL LETTER F WITH HOOK hexadecimal numeric entity.",
 "input":"&#x083;",
 "output": ["ParseError", ["Character", "\u0192"]]},
 {"description": "Windows-1252 DOUBLE LOW-9 QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x084;",
 "output": ["ParseError", ["Character", "\u201E"]]},
 {"description": "Windows-1252 HORIZONTAL ELLIPSIS hexadecimal numeric entity.",
 "input":"&#x085;",
 "output": ["ParseError", ["Character", "\u2026"]]},
 {"description": "Windows-1252 DAGGER hexadecimal numeric entity.",
 "input":"&#x086;",
 "output": ["ParseError", ["Character", "\u2020"]]},
 {"description": "Windows-1252 DOUBLE DAGGER hexadecimal numeric entity.",
 "input":"&#x087;",
 "output": ["ParseError", ["Character", "\u2021"]]},
 {"description": "Windows-1252 MODIFIER LETTER CIRCUMFLEX ACCENT hexadecimal numeric entity.",
 "input":"&#x088;",
 "output": ["ParseError", ["Character", "\u02C6"]]},
 {"description": "Windows-1252 PER MILLE SIGN hexadecimal numeric entity.",
 "input":"&#x089;",
 "output": ["ParseError", ["Character", "\u2030"]]},
 {"description": "Windows-1252 LATIN CAPITAL LETTER S WITH CARON hexadecimal numeric entity.",
 "input":"&#x08A;",
 "output": ["ParseError", ["Character", "\u0160"]]},
 {"description": "Windows-1252 SINGLE LEFT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x08B;",
 "output": ["ParseError", ["Character", "\u2039"]]},
 {"description": "Windows-1252 LATIN CAPITAL LIGATURE OE hexadecimal numeric entity.",
 "input":"&#x08C;",
 "output": ["ParseError", ["Character", "\u0152"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
 "input":"&#x08D;",
 "output": ["ParseError", ["Character", "\u008D"]]},
 {"description": "Windows-1252 LATIN CAPITAL LETTER Z WITH CARON hexadecimal numeric entity.",
 "input":"&#x08E;",
 "output": ["ParseError", ["Character", "\u017D"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
 "input":"&#x08F;",
 "output": ["ParseError", ["Character", "\u008F"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
 "input":"&#x090;",
 "output": ["ParseError", ["Character", "\u0090"]]},
 {"description": "Windows-1252 LEFT SINGLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x091;",
 "output": ["ParseError", ["Character", "\u2018"]]},
 {"description": "Windows-1252 RIGHT SINGLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x092;",
 "output": ["ParseError", ["Character", "\u2019"]]},
 {"description": "Windows-1252 LEFT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x093;",
 "output": ["ParseError", ["Character", "\u201C"]]},
 {"description": "Windows-1252 RIGHT DOUBLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x094;",
 "output": ["ParseError", ["Character", "\u201D"]]},
 {"description": "Windows-1252 BULLET hexadecimal numeric entity.",
 "input":"&#x095;",
 "output": ["ParseError", ["Character", "\u2022"]]},
 {"description": "Windows-1252 EN DASH hexadecimal numeric entity.",
 "input":"&#x096;",
 "output": ["ParseError", ["Character", "\u2013"]]},
 {"description": "Windows-1252 EM DASH hexadecimal numeric entity.",
 "input":"&#x097;",
 "output": ["ParseError", ["Character", "\u2014"]]},
 {"description": "Windows-1252 SMALL TILDE hexadecimal numeric entity.",
 "input":"&#x098;",
 "output": ["ParseError", ["Character", "\u02DC"]]},
 {"description": "Windows-1252 TRADE MARK SIGN hexadecimal numeric entity.",
 "input":"&#x099;",
 "output": ["ParseError", ["Character", "\u2122"]]},
 {"description": "Windows-1252 LATIN SMALL LETTER S WITH CARON hexadecimal numeric entity.",
 "input":"&#x09A;",
 "output": ["ParseError", ["Character", "\u0161"]]},
 {"description": "Windows-1252 SINGLE RIGHT-POINTING ANGLE QUOTATION MARK hexadecimal numeric entity.",
 "input":"&#x09B;",
 "output": ["ParseError", ["Character", "\u203A"]]},
 {"description": "Windows-1252 LATIN SMALL LIGATURE OE hexadecimal numeric entity.",
 "input":"&#x09C;",
 "output": ["ParseError", ["Character", "\u0153"]]},
 {"description": "Windows-1252 REPLACEMENT CHAR hexadecimal numeric entity.",
 "input":"&#x09D;",
 "output": ["ParseError", ["Character", "\u009D"]]},
 {"description": "Windows-1252 LATIN SMALL LETTER Z WITH CARON hexadecimal numeric entity.",
 "input":"&#x09E;",
 "output": ["ParseError", ["Character", "\u017E"]]},
 {"description": "Windows-1252 LATIN CAPITAL LETTER Y WITH DIAERESIS hexadecimal numeric entity.",
 "input":"&#x09F;",
 "output": ["ParseError", ["Character", "\u0178"]]},
 {"description": "Decimal numeric entity followed by hex character a.",
 "input":"&#97a",
 "output": ["ParseError", ["Character", "aa"]]},
 {"description": "Decimal numeric entity followed by hex character A.",
 "input":"&#97A",
 "output": ["ParseError", ["Character", "aA"]]},
 {"description": "Decimal numeric entity followed by hex character f.",
 "input":"&#97f",
 "output": ["ParseError", ["Character", "af"]]},
 {"description": "Decimal numeric entity followed by hex character A.",
 "input":"&#97F",
 "output": ["ParseError", ["Character", "aF"]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/escapeFlag.test
+++ b/lib/html5lib/tests/testdata/tokenizer/escapeFlag.test
@ -1,33 +0,0 @@
 {"tests": [
 {"description":"Commented close tag in RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--</xmp>--></xmp>",
 "output":[["Character", "foo<!--"], ["EndTag", "xmp"], ["Character", "-->"], ["EndTag", "xmp"]]},
 {"description":"Bogus comment in RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-->baz</xmp>",
 "output":[["Character", "foo<!-->baz"], ["EndTag", "xmp"]]},
 {"description":"End tag surrounded by bogus comment in RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!--></xmp><!-->baz</xmp>",
 "output":[["Character", "foo<!-->"], ["EndTag", "xmp"], "ParseError", ["Comment", ""], ["Character", "baz"], ["EndTag", "xmp"]]},
 {"description":"Commented entities in RCDATA",
 "initialStates":["RCDATA state"],
 "lastStartTag":"xmp",
 "input":" &amp; <!-- &amp; --> &amp; </xmp>",
 "output":[["Character", " & <!-- & --> & "], ["EndTag", "xmp"]]},
 {"description":"Incorrect comment ending sequences in RCDATA or RAWTEXT",
 "initialStates":["RCDATA state", "RAWTEXT state"],
 "lastStartTag":"xmp",
 "input":"foo<!-- x --x>x-- >x--!>x--<></xmp>",
 "output":[["Character", "foo<!-- x --x>x-- >x--!>x--<>"], ["EndTag", "xmp"]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/namedEntities.test
+++ b/lib/html5lib/tests/testdata/tokenizer/namedEntities.test
--- a/lib/html5lib/tests/testdata/tokenizer/numericEntities.test
+++ b/lib/html5lib/tests/testdata/tokenizer/numericEntities.test
--- a/lib/html5lib/tests/testdata/tokenizer/pendingSpecChanges.test
+++ b/lib/html5lib/tests/testdata/tokenizer/pendingSpecChanges.test
@ -1,7 +0,0 @@
 {"tests": [
 {"description":"<!---- >",
 "input":"<!---- >",
 "output":["ParseError", "ParseError", ["Comment","-- >"]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/test1.test
+++ b/lib/html5lib/tests/testdata/tokenizer/test1.test
@ -1,196 +0,0 @@
 {"tests": [
 {"description":"Correct Doctype lowercase",
 "input":"<!DOCTYPE html>",
 "output":[["DOCTYPE", "html", null, null, true]]},
 {"description":"Correct Doctype uppercase",
 "input":"<!DOCTYPE HTML>",
 "output":[["DOCTYPE", "html", null, null, true]]},
 {"description":"Correct Doctype mixed case",
 "input":"<!DOCTYPE HtMl>", 
 "output":[["DOCTYPE", "html", null, null, true]]},
 {"description":"Correct Doctype case with EOF",
 "input":"<!DOCTYPE HtMl", 
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Truncated doctype start",
 "input":"<!DOC>", 
 "output":["ParseError", ["Comment", "DOC"]]},
 {"description":"Doctype in error",
 "input":"<!DOCTYPE foo>", 
 "output":[["DOCTYPE", "foo", null, null, true]]},
 {"description":"Single Start Tag",
 "input":"<h>",
 "output":[["StartTag", "h", {}]]},
 {"description":"Empty end tag",
 "input":"</>",
 "output":["ParseError"]},
 {"description":"Empty start tag",
 "input":"<>",
 "output":["ParseError", ["Character", "<>"]]},
 {"description":"Start Tag w/attribute",
 "input":"<h a='b'>",
 "output":[["StartTag", "h", {"a":"b"}]]},
 {"description":"Start Tag w/attribute no quotes",
 "input":"<h a=b>",
 "output":[["StartTag", "h", {"a":"b"}]]},
 {"description":"Start/End Tag",
 "input":"<h></h>",
 "output":[["StartTag", "h", {}], ["EndTag", "h"]]},
 {"description":"Two unclosed start tags",
 "input":"<p>One<p>Two",
 "output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
 {"description":"End Tag w/attribute",
 "input":"<h></h a='b'>",
 "output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
 {"description":"Multiple atts",
 "input":"<h a='b' c='d'>",
 "output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
 {"description":"Multiple atts no space",
 "input":"<h a='b'c='d'>",
 "output":["ParseError", ["StartTag", "h", {"a":"b", "c":"d"}]]},
 {"description":"Repeated attr",
 "input":"<h a='b' a='d'>",
 "output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
 {"description":"Simple comment",
 "input":"<!--comment-->",
 "output":[["Comment", "comment"]]},
 {"description":"Comment, Central dash no space",
 "input":"<!----->",
 "output":["ParseError", ["Comment", "-"]]},
 {"description":"Comment, two central dashes",
 "input":"<!-- --comment -->",
 "output":["ParseError", ["Comment", " --comment "]]},
 {"description":"Unfinished comment",
 "input":"<!--comment",
 "output":["ParseError", ["Comment", "comment"]]},
 {"description":"Start of a comment",
 "input":"<!-",
 "output":["ParseError", ["Comment", "-"]]},
 {"description":"Short comment",
 "input":"<!-->",
 "output":["ParseError", ["Comment", ""]]},
 {"description":"Short comment two",
 "input":"<!--->",
 "output":["ParseError", ["Comment", ""]]},
 {"description":"Short comment three",
 "input":"<!---->",
 "output":[["Comment", ""]]},
 {"description":"Ampersand EOF",
 "input":"&",
 "output":[["Character", "&"]]},
 {"description":"Ampersand ampersand EOF",
 "input":"&&",
 "output":[["Character", "&&"]]},
 {"description":"Ampersand space EOF",
 "input":"& ",
 "output":[["Character", "& "]]},
 {"description":"Unfinished entity",
 "input":"&f",
 "output":["ParseError", ["Character", "&f"]]},
 {"description":"Ampersand, number sign",
 "input":"&#",
 "output":["ParseError", ["Character", "&#"]]},
 {"description":"Unfinished numeric entity",
 "input":"&#x",
 "output":["ParseError", ["Character", "&#x"]]},
 {"description":"Entity with trailing semicolon (1)",
 "input":"I'm &not;it",
 "output":[["Character","I'm \u00ACit"]]},
 {"description":"Entity with trailing semicolon (2)",
 "input":"I'm &notin;",
 "output":[["Character","I'm \u2209"]]},
 {"description":"Entity without trailing semicolon (1)",
 "input":"I'm &notit",
 "output":[["Character","I'm "], "ParseError", ["Character", "\u00ACit"]]},
 {"description":"Entity without trailing semicolon (2)",
 "input":"I'm &notin",
 "output":[["Character","I'm "], "ParseError", ["Character", "\u00ACin"]]},
 {"description":"Partial entity match at end of file",
 "input":"I'm &no",
 "output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
 {"description":"Non-ASCII character reference name",
 "input":"&\u00AC;",
 "output":["ParseError", ["Character", "&\u00AC;"]]},
 {"description":"ASCII decimal entity",
 "input":"&#0036;",
 "output":[["Character","$"]]},
 {"description":"ASCII hexadecimal entity",
 "input":"&#x3f;",
 "output":[["Character","?"]]},
 {"description":"Hexadecimal entity in attribute",
 "input":"<h a='&#x3f;'></h>",
 "output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]},
 {"description":"Entity in attribute without semicolon ending in x",
 "input":"<h a='&notx'>",
 "output":["ParseError", ["StartTag", "h", {"a":"&notx"}]]},
 {"description":"Entity in attribute without semicolon ending in 1",
 "input":"<h a='&not1'>",
 "output":["ParseError", ["StartTag", "h", {"a":"&not1"}]]},
 {"description":"Entity in attribute without semicolon ending in i",
 "input":"<h a='&noti'>",
 "output":["ParseError", ["StartTag", "h", {"a":"&noti"}]]},
 {"description":"Entity in attribute without semicolon",
 "input":"<h a='&COPY'>",
 "output":["ParseError", ["StartTag", "h", {"a":"\u00A9"}]]},
 {"description":"Unquoted attribute ending in ampersand",
 "input":"<s o=& t>",
 "output":[["StartTag","s",{"o":"&","t":""}]]},
 {"description":"Unquoted attribute at end of tag with final character of &, with tag followed by characters",
 "input":"<a a=a&>foo",
 "output":[["StartTag", "a", {"a":"a&"}], ["Character", "foo"]]},
 {"description":"plaintext element",
 "input":"<plaintext>foobar",
 "output":[["StartTag","plaintext",{}], ["Character","foobar"]]},
 {"description":"Open angled bracket in unquoted attribute value state",
 "input":"<a a=f<>",
 "output":["ParseError", ["StartTag", "a", {"a":"f<"}]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/test2.test
+++ b/lib/html5lib/tests/testdata/tokenizer/test2.test
@ -1,179 +0,0 @@
 {"tests": [
 {"description":"DOCTYPE without name",
 "input":"<!DOCTYPE>",
 "output":["ParseError", "ParseError", ["DOCTYPE", "", null, null, false]]},
 {"description":"DOCTYPE without space before name",
 "input":"<!DOCTYPEhtml>",
 "output":["ParseError", ["DOCTYPE", "html", null, null, true]]},
 {"description":"Incorrect DOCTYPE without a space before name",
 "input":"<!DOCTYPEfoo>",
 "output":["ParseError", ["DOCTYPE", "foo", null, null, true]]},
 {"description":"DOCTYPE with publicId",
 "input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
 "output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", null, true]]},
 {"description":"DOCTYPE with EOF after PUBLIC",
 "input":"<!DOCTYPE html PUBLIC",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"DOCTYPE with EOF after PUBLIC '",
 "input":"<!DOCTYPE html PUBLIC '",
 "output":["ParseError", ["DOCTYPE", "html", "", null, false]]},
 {"description":"DOCTYPE with EOF after PUBLIC 'x",
 "input":"<!DOCTYPE html PUBLIC 'x",
 "output":["ParseError", ["DOCTYPE", "html", "x", null, false]]},
 {"description":"DOCTYPE with systemId",
 "input":"<!DOCTYPE html SYSTEM \"-//W3C//DTD HTML Transitional 4.01//EN\">",
 "output":[["DOCTYPE", "html", null, "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
 {"description":"DOCTYPE with publicId and systemId",
 "input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\" \"-//W3C//DTD HTML Transitional 4.01//EN\">",
 "output":[["DOCTYPE", "html", "-//W3C//DTD HTML Transitional 4.01//EN", "-//W3C//DTD HTML Transitional 4.01//EN", true]]},
 {"description":"DOCTYPE with > in double-quoted publicId",
 "input":"<!DOCTYPE html PUBLIC \">x",
 "output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
 {"description":"DOCTYPE with > in single-quoted publicId",
 "input":"<!DOCTYPE html PUBLIC '>x",
 "output":["ParseError", ["DOCTYPE", "html", "", null, false], ["Character", "x"]]},
 {"description":"DOCTYPE with > in double-quoted systemId",
 "input":"<!DOCTYPE html PUBLIC \"foo\" \">x",
 "output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
 {"description":"DOCTYPE with > in single-quoted systemId",
 "input":"<!DOCTYPE html PUBLIC 'foo' '>x",
 "output":["ParseError", ["DOCTYPE", "html", "foo", "", false], ["Character", "x"]]},
 {"description":"Incomplete doctype",
 "input":"<!DOCTYPE html ",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Numeric entity representing the NUL character",
 "input":"&#0000;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"Hexadecimal entity representing the NUL character",
 "input":"&#x0000;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
 "input":"&#2225222;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
 "input":"&#x1010FFFF;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"Hexadecimal entity pair representing a surrogate pair",
 "input":"&#xD869;&#xDED6;",
 "output":["ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"]]},
 {"description":"Hexadecimal entity with mixed uppercase and lowercase",
 "input":"&#xaBcD;",
 "output":[["Character", "\uABCD"]]},
 {"description":"Entity without a name",
 "input":"&;",
 "output":["ParseError", ["Character", "&;"]]},
 {"description":"Unescaped ampersand in attribute value",
 "input":"<h a='&'>",
 "output":[["StartTag", "h", { "a":"&" }]]},
 {"description":"StartTag containing <",
 "input":"<a<b>",
 "output":[["StartTag", "a<b", { }]]},
 {"description":"Non-void element containing trailing /",
 "input":"<h/>",
 "output":[["StartTag","h",{},true]]},
 {"description":"Void element with permitted slash",
 "input":"<br/>",
 "output":[["StartTag","br",{},true]]},
 {"description":"Void element with permitted slash (with attribute)",
 "input":"<br foo='bar'/>",
 "output":[["StartTag","br",{"foo":"bar"},true]]},
 {"description":"StartTag containing /",
 "input":"<h/a='b'>",
 "output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
 {"description":"Double-quoted attribute value",
 "input":"<h a=\"b\">",
 "output":[["StartTag", "h", { "a":"b" }]]},
 {"description":"Unescaped </",
 "input":"</",
 "output":["ParseError", ["Character", "</"]]},
 {"description":"Illegal end tag name",
 "input":"</1>",
 "output":["ParseError", ["Comment", "1"]]},
 {"description":"Simili processing instruction",
 "input":"<?namespace>",
 "output":["ParseError", ["Comment", "?namespace"]]},
 {"description":"A bogus comment stops at >, even if preceeded by two dashes",
 "input":"<?foo-->",
 "output":["ParseError", ["Comment", "?foo--"]]},
 {"description":"Unescaped <",
 "input":"foo < bar",
 "output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
 {"description":"Null Byte Replacement",
 "input":"\u0000",
 "output":["ParseError", ["Character", "\u0000"]]},
 {"description":"Comment with dash",
 "input":"<!---x",
 "output":["ParseError", ["Comment", "-x"]]},
 {"description":"Entity + newline",
 "input":"\nx\n&gt;\n",
 "output":[["Character","\nx\n>\n"]]},
 {"description":"Start tag with no attributes but space before the greater-than sign",
 "input":"<h >",
 "output":[["StartTag", "h", {}]]},
 {"description":"Empty attribute followed by uppercase attribute",
 "input":"<h a B=''>",
 "output":[["StartTag", "h", {"a":"", "b":""}]]},
 {"description":"Double-quote after attribute name",
 "input":"<h a \">",
 "output":["ParseError", ["StartTag", "h", {"a":"", "\"":""}]]},
 {"description":"Single-quote after attribute name",
 "input":"<h a '>",
 "output":["ParseError", ["StartTag", "h", {"a":"", "'":""}]]},
 {"description":"Empty end tag with following characters",
 "input":"a</>bc",
 "output":[["Character", "a"], "ParseError", ["Character", "bc"]]},
 {"description":"Empty end tag with following tag",
 "input":"a</><b>c",
 "output":[["Character", "a"], "ParseError", ["StartTag", "b", {}], ["Character", "c"]]},
 {"description":"Empty end tag with following comment",
 "input":"a</><!--b-->c",
 "output":[["Character", "a"], "ParseError", ["Comment", "b"], ["Character", "c"]]},
 {"description":"Empty end tag with following end tag",
 "input":"a</></b>c",
 "output":[["Character", "a"], "ParseError", ["EndTag", "b"], ["Character", "c"]]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/test3.test
+++ b/lib/html5lib/tests/testdata/tokenizer/test3.test
--- a/lib/html5lib/tests/testdata/tokenizer/test4.test
+++ b/lib/html5lib/tests/testdata/tokenizer/test4.test
@ -1,344 +0,0 @@
 {"tests": [
 {"description":"< in attribute name",
 "input":"<z/0  <>",
 "output":["ParseError", "ParseError", ["StartTag", "z", {"0": "", "<": ""}]]},
 {"description":"< in attribute value",
 "input":"<z x=<>",
 "output":["ParseError", ["StartTag", "z", {"x": "<"}]]},
 {"description":"= in unquoted attribute value",
 "input":"<z z=z=z>",
 "output":["ParseError", ["StartTag", "z", {"z": "z=z"}]]},
 {"description":"= attribute",
 "input":"<z =>",
 "output":["ParseError", ["StartTag", "z", {"=": ""}]]},
 {"description":"== attribute",
 "input":"<z ==>",
 "output":["ParseError", "ParseError", ["StartTag", "z", {"=": ""}]]},
 {"description":"=== attribute",
 "input":"<z ===>",
 "output":["ParseError", "ParseError", ["StartTag", "z", {"=": "="}]]},
 {"description":"==== attribute",
 "input":"<z ====>",
 "output":["ParseError", "ParseError", "ParseError", ["StartTag", "z", {"=": "=="}]]},
 {"description":"Allowed \" after ampersand in attribute value",
 "input":"<z z=\"&\">",
 "output":[["StartTag", "z", {"z": "&"}]]},
 {"description":"Non-allowed ' after ampersand in attribute value",
 "input":"<z z=\"&'\">",
 "output":["ParseError", ["StartTag", "z", {"z": "&'"}]]},
 {"description":"Allowed ' after ampersand in attribute value",
 "input":"<z z='&'>",
 "output":[["StartTag", "z", {"z": "&"}]]},
 {"description":"Non-allowed \" after ampersand in attribute value",
 "input":"<z z='&\"'>",
 "output":["ParseError", ["StartTag", "z", {"z": "&\""}]]},
 {"description":"Text after bogus character reference",
 "input":"<z z='&xlink_xmlns;'>bar<z>",
 "output":["ParseError",["StartTag","z",{"z":"&xlink_xmlns;"}],["Character","bar"],["StartTag","z",{}]]},
 {"description":"Text after hex character reference",
 "input":"<z z='&#x0020; foo'>bar<z>",
 "output":[["StartTag","z",{"z":"  foo"}],["Character","bar"],["StartTag","z",{}]]},
 {"description":"Attribute name starting with \"",
 "input":"<foo \"='bar'>",
 "output":["ParseError", ["StartTag", "foo", {"\"": "bar"}]]},
 {"description":"Attribute name starting with '",
 "input":"<foo '='bar'>",
 "output":["ParseError", ["StartTag", "foo", {"'": "bar"}]]},
 {"description":"Attribute name containing \"",
 "input":"<foo a\"b='bar'>",
 "output":["ParseError", ["StartTag", "foo", {"a\"b": "bar"}]]},
 {"description":"Attribute name containing '",
 "input":"<foo a'b='bar'>",
 "output":["ParseError", ["StartTag", "foo", {"a'b": "bar"}]]},
 {"description":"Unquoted attribute value containing '",
 "input":"<foo a=b'c>",
 "output":["ParseError", ["StartTag", "foo", {"a": "b'c"}]]},
 {"description":"Unquoted attribute value containing \"",
 "input":"<foo a=b\"c>",
 "output":["ParseError", ["StartTag", "foo", {"a": "b\"c"}]]},
 {"description":"Double-quoted attribute value not followed by whitespace",
 "input":"<foo a=\"b\"c>",
 "output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
 {"description":"Single-quoted attribute value not followed by whitespace",
 "input":"<foo a='b'c>",
 "output":["ParseError", ["StartTag", "foo", {"a": "b", "c": ""}]]},
 {"description":"Quoted attribute followed by permitted /",
 "input":"<br a='b'/>",
 "output":[["StartTag","br",{"a":"b"},true]]},
 {"description":"Quoted attribute followed by non-permitted /",
 "input":"<bar a='b'/>",
 "output":[["StartTag","bar",{"a":"b"},true]]},
 {"description":"CR EOF after doctype name",
 "input":"<!doctype html \r",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"CR EOF in tag name",
 "input":"<z\r",
 "output":["ParseError"]},
 {"description":"Slash EOF in tag name",
 "input":"<z/",
 "output":["ParseError"]},
 {"description":"Zero hex numeric entity",
 "input":"&#x0",
 "output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
 {"description":"Zero decimal numeric entity",
 "input":"&#0",
 "output":["ParseError", "ParseError", ["Character", "\uFFFD"]]},
 {"description":"Zero-prefixed hex numeric entity",
 "input":"&#x000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000041;",
 "output":[["Character", "A"]]},
 {"description":"Zero-prefixed decimal numeric entity",
 "input":"&#000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000065;",
 "output":[["Character", "A"]]},
 {"description":"Empty hex numeric entities",
 "input":"&#x &#X ",
 "output":["ParseError", ["Character", "&#x "], "ParseError", ["Character", "&#X "]]},
 {"description":"Empty decimal numeric entities",
 "input":"&# &#; ",
 "output":["ParseError", ["Character", "&# "], "ParseError", ["Character", "&#; "]]},
 {"description":"Non-BMP numeric entity",
 "input":"&#x10000;",
 "output":[["Character", "\uD800\uDC00"]]},
 {"description":"Maximum non-BMP numeric entity",
 "input":"&#X10FFFF;",
 "output":["ParseError", ["Character", "\uDBFF\uDFFF"]]},
 {"description":"Above maximum numeric entity",
 "input":"&#x110000;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"32-bit hex numeric entity",
 "input":"&#x80000041;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"33-bit hex numeric entity",
 "input":"&#x100000041;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"33-bit decimal numeric entity",
 "input":"&#4294967361;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"65-bit hex numeric entity",
 "input":"&#x10000000000000041;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"65-bit decimal numeric entity",
 "input":"&#18446744073709551681;",
 "output":["ParseError", ["Character", "\uFFFD"]]},
 {"description":"Surrogate code point edge cases",
 "input":"&#xD7FF;&#xD800;&#xD801;&#xDFFE;&#xDFFF;&#xE000;",
 "output":[["Character", "\uD7FF"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD"], "ParseError", ["Character", "\uFFFD\uE000"]]},
 {"description":"Uppercase start tag name",
 "input":"<X>",
 "output":[["StartTag", "x", {}]]},
 {"description":"Uppercase end tag name",
 "input":"</X>",
 "output":[["EndTag", "x"]]},
 {"description":"Uppercase attribute name",
 "input":"<x X>",
 "output":[["StartTag", "x", { "x":"" }]]},
 {"description":"Tag/attribute name case edge values",
 "input":"<x@AZ[`az{ @AZ[`az{>",
 "output":[["StartTag", "x@az[`az{", { "@az[`az{":"" }]]},
 {"description":"Duplicate different-case attributes",
 "input":"<x x=1 x=2 X=3>",
 "output":["ParseError", "ParseError", ["StartTag", "x", { "x":"1" }]]},
 {"description":"Uppercase close tag attributes",
 "input":"</x X>",
 "output":["ParseError", ["EndTag", "x"]]},
 {"description":"Duplicate close tag attributes",
 "input":"</x x x>",
 "output":["ParseError", "ParseError", ["EndTag", "x"]]},
 {"description":"Permitted slash",
 "input":"<br/>",
 "output":[["StartTag","br",{},true]]},
 {"description":"Non-permitted slash",
 "input":"<xr/>",
 "output":[["StartTag","xr",{},true]]},
 {"description":"Permitted slash but in close tag",
 "input":"</br/>",
 "output":["ParseError", ["EndTag", "br"]]},
 {"description":"Doctype public case-sensitivity (1)",
 "input":"<!DoCtYpE HtMl PuBlIc \"AbC\" \"XyZ\">",
 "output":[["DOCTYPE", "html", "AbC", "XyZ", true]]},
 {"description":"Doctype public case-sensitivity (2)",
 "input":"<!dOcTyPe hTmL pUbLiC \"aBc\" \"xYz\">",
 "output":[["DOCTYPE", "html", "aBc", "xYz", true]]},
 {"description":"Doctype system case-sensitivity (1)",
 "input":"<!DoCtYpE HtMl SyStEm \"XyZ\">",
 "output":[["DOCTYPE", "html", null, "XyZ", true]]},
 {"description":"Doctype system case-sensitivity (2)",
 "input":"<!dOcTyPe hTmL sYsTeM \"xYz\">",
 "output":[["DOCTYPE", "html", null, "xYz", true]]},
 {"description":"U+0000 in lookahead region after non-matching character",
 "input":"<!doc>\u0000",
 "output":["ParseError", ["Comment", "doc"], "ParseError", ["Character", "\u0000"]],
 "ignoreErrorOrder":true},
 {"description":"U+0000 in lookahead region",
 "input":"<!doc\u0000",
 "output":["ParseError", ["Comment", "doc\uFFFD"]],
 "ignoreErrorOrder":true},
 {"description":"U+0080 in lookahead region",
 "input":"<!doc\u0080",
 "output":["ParseError", "ParseError", ["Comment", "doc\u0080"]],
 "ignoreErrorOrder":true},
 {"description":"U+FDD1 in lookahead region",
 "input":"<!doc\uFDD1",
 "output":["ParseError", "ParseError", ["Comment", "doc\uFDD1"]],
 "ignoreErrorOrder":true},
 {"description":"U+1FFFF in lookahead region",
 "input":"<!doc\uD83F\uDFFF",
 "output":["ParseError", "ParseError", ["Comment", "doc\uD83F\uDFFF"]],
 "ignoreErrorOrder":true},
 {"description":"CR followed by non-LF",
 "input":"\r?",
 "output":[["Character", "\n?"]]},
 {"description":"CR at EOF",
 "input":"\r",
 "output":[["Character", "\n"]]},
 {"description":"LF at EOF",
 "input":"\n",
 "output":[["Character", "\n"]]},
 {"description":"CR LF",
 "input":"\r\n",
 "output":[["Character", "\n"]]},
 {"description":"CR CR",
 "input":"\r\r",
 "output":[["Character", "\n\n"]]},
 {"description":"LF LF",
 "input":"\n\n",
 "output":[["Character", "\n\n"]]},
 {"description":"LF CR",
 "input":"\n\r",
 "output":[["Character", "\n\n"]]},
 {"description":"text CR CR CR text",
 "input":"text\r\r\rtext",
 "output":[["Character", "text\n\n\ntext"]]},
 {"description":"Doctype publik",
 "input":"<!DOCTYPE html PUBLIK \"AbC\" \"XyZ\">",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Doctype publi",
 "input":"<!DOCTYPE html PUBLI",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Doctype sistem",
 "input":"<!DOCTYPE html SISTEM \"AbC\">",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Doctype sys",
 "input":"<!DOCTYPE html SYS",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false]]},
 {"description":"Doctype html x>text",
 "input":"<!DOCTYPE html x>text",
 "output":["ParseError", ["DOCTYPE", "html", null, null, false], ["Character", "text"]]},
 {"description":"Grave accent in unquoted attribute",
 "input":"<a a=aa`>",
 "output":["ParseError", ["StartTag", "a", {"a":"aa`"}]]},
 {"description":"EOF in tag name state ",
 "input":"<a",
 "output":["ParseError"]},
 {"description":"EOF in tag name state",
 "input":"<a",
 "output":["ParseError"]},
 {"description":"EOF in before attribute name state",
 "input":"<a ",
 "output":["ParseError"]},
 {"description":"EOF in attribute name state",
 "input":"<a a",
 "output":["ParseError"]},
 {"description":"EOF in after attribute name state",
 "input":"<a a ",
 "output":["ParseError"]},
 {"description":"EOF in before attribute value state",
 "input":"<a a =",
 "output":["ParseError"]},
 {"description":"EOF in attribute value (double quoted) state",
 "input":"<a a =\"a",
 "output":["ParseError"]},
 {"description":"EOF in attribute value (single quoted) state",
 "input":"<a a ='a",
 "output":["ParseError"]},
 {"description":"EOF in attribute value (unquoted) state",
 "input":"<a a =a",
 "output":["ParseError"]},
 {"description":"EOF in after attribute value state",
 "input":"<a a ='a'",
 "output":["ParseError"]}
 ]}
--- a/lib/html5lib/tests/testdata/tokenizer/unicodeChars.test
+++ b/lib/html5lib/tests/testdata/tokenizer/unicodeChars.test
--- a/lib/html5lib/tests/testdata/tokenizer/unicodeCharsProblematic.test
+++ b/lib/html5lib/tests/testdata/tokenizer/unicodeCharsProblematic.test
@ -1,27 +0,0 @@
 {"tests" : [
 {"description": "Invalid Unicode character U+DFFF",
 "doubleEscaped":true,
 "input": "\\uDFFF",
 "output":["ParseError", ["Character", "\\uFFFD"]]},
 {"description": "Invalid Unicode character U+D800",
 "doubleEscaped":true,
 "input": "\\uD800",
 "output":["ParseError", ["Character", "\\uFFFD"]]},
 {"description": "Invalid Unicode character U+DFFF with valid preceding character",
 "doubleEscaped":true,
 "input": "a\\uDFFF",
 "output":["ParseError", ["Character", "a\\uFFFD"]]},
 {"description": "Invalid Unicode character U+D800 with valid following character",
 "doubleEscaped":true,
 "input": "\\uD800a",
 "output":["ParseError", ["Character", "\\uFFFDa"]]},
 {"description":"CR followed by U+0000",
 "input":"\r\u0000",
 "output":[["Character", "\n"], "ParseError", ["Character", "\u0000"]],
 "ignoreErrorOrder":true}
 ]
 }
--- a/lib/html5lib/tests/testdata/tokenizer/xmlViolation.test
+++ b/lib/html5lib/tests/testdata/tokenizer/xmlViolation.test
@ -1,22 +0,0 @@
 {"xmlViolationTests": [
 {"description":"Non-XML character",
 "input":"a\uFFFFb",
 "ignoreErrorOrder":true,
 "output":["ParseError",["Character","a\uFFFDb"]]},
 {"description":"Non-XML space",
 "input":"a\u000Cb",
 "ignoreErrorOrder":true,
 "output":[["Character","a b"]]},
 {"description":"Double hyphen in comment",
 "input":"<!-- foo -- bar -->",
 "output":["ParseError",["Comment"," foo - - bar "]]},
 {"description":"FF between attributes",
 "input":"<a b=''\u000Cc=''>",
 "output":[["StartTag","a",{"b":"","c":""}]]}
 ]}
--- a/lib/html5lib/tests/testdata/tree-construction/adoption01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/adoption01.dat
@ -1,194 +0,0 @@
 #data
 <a><p></a></p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |     <p>
 |       <a>
 #data
 <a>1<p>2</a>3</p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |     <p>
 |       <a>
 |         "2"
 |       "3"
 #data
 <a>1<button>2</a>3</button>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |     <button>
 |       <a>
 |         "2"
 |       "3"
 #data
 <a>1<b>2</a>3</b>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |       <b>
 |         "2"
 |     <b>
 |       "3"
 #data
 <a>1<div>2<div>3</a>4</div>5</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |     <div>
 |       <a>
 |         "2"
 |       <div>
 |         <a>
 |           "3"
 |         "4"
 |       "5"
 #data
 <table><a>1<p>2</a>3</p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |     <p>
 |       <a>
 |         "2"
 |       "3"
 |     <table>
 #data
 <b><b><a><p></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <b>
 |         <a>
 |         <p>
 |           <a>
 #data
 <b><a><b><p></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <a>
 |         <b>
 |       <b>
 |         <p>
 |           <a>
 #data
 <a><b><b><p></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <b>
 |         <b>
 |     <b>
 |       <b>
 |         <p>
 |           <a>
 #data
 <p>1<s id="A">2<b id="B">3</p>4</s>5</b>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       "1"
 |       <s>
 |         id="A"
 |         "2"
 |         <b>
 |           id="B"
 |           "3"
 |     <s>
 |       id="A"
 |       <b>
 |         id="B"
 |         "4"
 |     <b>
 |       id="B"
 |       "5"
 #data
 <table><a>1<td>2</td>3</table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "1"
 |     <a>
 |       "3"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "2"
 #data
 <table>A<td>B</td>C</table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "AC"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "B"
 #data
 <a><svg><tr><input></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <svg svg>
 |         <svg tr>
 |           <svg input>
--- a/lib/html5lib/tests/testdata/tree-construction/adoption02.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/adoption02.dat
@ -1,31 +0,0 @@
 #data
 <b>1<i>2<p>3</b>4
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       "1"
 |       <i>
 |         "2"
 |     <i>
 |       <p>
 |         <b>
 |           "3"
 |         "4"
 #data
 <a><div><style></style><address><a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |     <div>
 |       <a>
 |         <style>
 |       <address>
 |         <a>
 |         <a>
--- a/lib/html5lib/tests/testdata/tree-construction/comments01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/comments01.dat
@ -1,135 +0,0 @@
 #data
 FOO<!-- BAR -->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR  -->
 |     "BAZ"
 #data
 FOO<!-- BAR --!>BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR  -->
 |     "BAZ"
 #data
 FOO<!-- BAR --   >BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR --   >BAZ -->
 #data
 FOO<!-- BAR -- <QUX> -- MUX -->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR -- <QUX> -- MUX  -->
 |     "BAZ"
 #data
 FOO<!-- BAR -- <QUX> -- MUX --!>BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR -- <QUX> -- MUX  -->
 |     "BAZ"
 #data
 FOO<!-- BAR -- <QUX> -- MUX -- >BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  BAR -- <QUX> -- MUX -- >BAZ -->
 #data
 FOO<!---->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  -->
 |     "BAZ"
 #data
 FOO<!--->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  -->
 |     "BAZ"
 #data
 FOO<!-->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!--  -->
 |     "BAZ"
 #data
 <?xml version="1.0">Hi
 #errors
 #document
 | <!-- ?xml version="1.0" -->
 | <html>
 |   <head>
 |   <body>
 |     "Hi"
 #data
 <?xml version="1.0">
 #errors
 #document
 | <!-- ?xml version="1.0" -->
 | <html>
 |   <head>
 |   <body>
 #data
 <?xml version
 #errors
 #document
 | <!-- ?xml version -->
 | <html>
 |   <head>
 |   <body>
 #data
 FOO<!----->BAZ
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <!-- - -->
 |     "BAZ"
--- a/lib/html5lib/tests/testdata/tree-construction/doctype01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/doctype01.dat
@ -1,370 +0,0 @@
 #data
 <!DOCTYPE html>Hello
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!dOctYpE HtMl>Hello
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPEhtml>Hello
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE>Hello
 #errors
 #document
 | <!DOCTYPE >
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE >Hello
 #errors
 #document
 | <!DOCTYPE >
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato >Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato taco>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato taco "ddd>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato sYstEM>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato sYstEM    >Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE   potato       sYstEM  ggg>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato SYSTEM taco  >Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato SYSTEM 'taco"'>Hello
 #errors
 #document
 | <!DOCTYPE potato "" "taco"">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato SYSTEM "taco">Hello
 #errors
 #document
 | <!DOCTYPE potato "" "taco">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato SYSTEM "tai'co">Hello
 #errors
 #document
 | <!DOCTYPE potato "" "tai'co">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato SYSTEMtaco "ddd">Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato grass SYSTEM taco>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato pUbLIc>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato pUbLIc >Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato pUbLIcgoof>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato PUBLIC goof>Hello
 #errors
 #document
 | <!DOCTYPE potato>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato PUBLIC "go'of">Hello
 #errors
 #document
 | <!DOCTYPE potato "go'of" "">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato PUBLIC 'go'of'>Hello
 #errors
 #document
 | <!DOCTYPE potato "go" "">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato PUBLIC 'go:hh   of' >Hello
 #errors
 #document
 | <!DOCTYPE potato "go:hh   of" "">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE potato PUBLIC "W3C-//dfdf" SYSTEM ggg>Hello
 #errors
 #document
 | <!DOCTYPE potato "W3C-//dfdf" "">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
   "http://www.w3.org/TR/html4/strict.dtd">Hello
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE ...>Hello
 #errors
 #document
 | <!DOCTYPE ...>
 | <html>
 |   <head>
 |   <body>
 |     "Hello"
 #data
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD XHTML 1.0 Frameset//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE root-element [SYSTEM OR PUBLIC FPI] "uri" [ 
 <!-- internal declarations -->
 ]>
 #errors
 #document
 | <!DOCTYPE root-element>
 | <html>
 |   <head>
 |   <body>
 |     "]>"
 #data
 <!DOCTYPE html PUBLIC
  "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"
    "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
 #errors
 #document
 | <!DOCTYPE html "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE HTML SYSTEM "http://www.w3.org/DTD/HTML4-strict.dtd"><body><b>Mine!</b></body>
 #errors
 #document
 | <!DOCTYPE html "" "http://www.w3.org/DTD/HTML4-strict.dtd">
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       "Mine!"
 #data
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd">
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE HTML PUBLIC"-//W3C//DTD HTML 4.01//EN"'http://www.w3.org/TR/html4/strict.dtd'>
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE HTML PUBLIC'-//W3C//DTD HTML 4.01//EN''http://www.w3.org/TR/html4/strict.dtd'>
 #errors
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 | <html>
 |   <head>
 |   <body>
--- a/lib/html5lib/tests/testdata/tree-construction/domjs-unsafe.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/domjs-unsafe.dat
--- a/lib/html5lib/tests/testdata/tree-construction/entities01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/entities01.dat
@ -1,603 +0,0 @@
 #data
 FOO&gt;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO>BAR"
 #data
 FOO&gtBAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO>BAR"
 #data
 FOO&gt BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO> BAR"
 #data
 FOO&gt;;;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO>;;BAR"
 #data
 I'm &notit; I tell you
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "I'm ¬it; I tell you"
 #data
 I'm &notin; I tell you
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "I'm ∉ I tell you"
 #data
 FOO& BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO& BAR"
 #data
 FOO&<BAR>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&"
 |     <bar>
 #data
 FOO&&&&gt;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&&&>BAR"
 #data
 FOO&#41;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO)BAR"
 #data
 FOO&#x41;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOABAR"
 #data
 FOO&#X41;BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOABAR"
 #data
 FOO&#BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&#BAR"
 #data
 FOO&#ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&#ZOO"
 #data
 FOO&#xBAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOºR"
 #data
 FOO&#xZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&#xZOO"
 #data
 FOO&#XZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO&#XZOO"
 #data
 FOO&#41BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO)BAR"
 #data
 FOO&#x41BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO䆺R"
 #data
 FOO&#x41ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOAZOO"
 #data
 FOO&#x0000;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#x0078;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOxZOO"
 #data
 FOO&#x0079;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOyZOO"
 #data
 FOO&#x0080;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO€ZOO"
 #data
 FOO&#x0081;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x0082;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO‚ZOO"
 #data
 FOO&#x0083;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOƒZOO"
 #data
 FOO&#x0084;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO„ZOO"
 #data
 FOO&#x0085;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO…ZOO"
 #data
 FOO&#x0086;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO†ZOO"
 #data
 FOO&#x0087;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO‡ZOO"
 #data
 FOO&#x0088;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOˆZOO"
 #data
 FOO&#x0089;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO‰ZOO"
 #data
 FOO&#x008A;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOŠZOO"
 #data
 FOO&#x008B;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO‹ZOO"
 #data
 FOO&#x008C;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOŒZOO"
 #data
 FOO&#x008D;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x008E;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOŽZOO"
 #data
 FOO&#x008F;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x0090;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x0091;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO‘ZOO"
 #data
 FOO&#x0092;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO’ZOO"
 #data
 FOO&#x0093;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO“ZOO"
 #data
 FOO&#x0094;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO”ZOO"
 #data
 FOO&#x0095;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO•ZOO"
 #data
 FOO&#x0096;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO–ZOO"
 #data
 FOO&#x0097;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO—ZOO"
 #data
 FOO&#x0098;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO˜ZOO"
 #data
 FOO&#x0099;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO™ZOO"
 #data
 FOO&#x009A;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOšZOO"
 #data
 FOO&#x009B;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO›ZOO"
 #data
 FOO&#x009C;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOœZOO"
 #data
 FOO&#x009D;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x009E;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOžZOO"
 #data
 FOO&#x009F;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOŸZOO"
 #data
 FOO&#x00A0;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO ZOO"
 #data
 FOO&#xD7FF;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO퟿ZOO"
 #data
 FOO&#xD800;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#xD801;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#xDFFE;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#xDFFF;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#xE000;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOOZOO"
 #data
 FOO&#x10FFFE;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO􏿾ZOO"
 #data
 FOO&#x1087D4;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO􈟔ZOO"
 #data
 FOO&#x10FFFF;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO􏿿ZOO"
 #data
 FOO&#x110000;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
 #data
 FOO&#xFFFFFF;ZOO
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO<4F>ZOO"
--- a/lib/html5lib/tests/testdata/tree-construction/entities02.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/entities02.dat
@ -1,249 +0,0 @@
 #data
 <div bar="ZZ&gt;YY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ>YY"
 #data
 <div bar="ZZ&"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&"
 #data
 <div bar='ZZ&'></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&"
 #data
 <div bar=ZZ&></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&"
 #data
 <div bar="ZZ&gt=YY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&gt=YY"
 #data
 <div bar="ZZ&gt0YY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&gt0YY"
 #data
 <div bar="ZZ&gt9YY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&gt9YY"
 #data
 <div bar="ZZ&gtaYY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&gtaYY"
 #data
 <div bar="ZZ&gtZYY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&gtZYY"
 #data
 <div bar="ZZ&gt YY"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ> YY"
 #data
 <div bar="ZZ&gt"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ>"
 #data
 <div bar='ZZ&gt'></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ>"
 #data
 <div bar=ZZ&gt></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ>"
 #data
 <div bar="ZZ&pound_id=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ£_id=23"
 #data
 <div bar="ZZ&prod_id=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&prod_id=23"
 #data
 <div bar="ZZ&pound;_id=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ£_id=23"
 #data
 <div bar="ZZ&prod;_id=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ∏_id=23"
 #data
 <div bar="ZZ&pound=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&pound=23"
 #data
 <div bar="ZZ&prod=23"></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       bar="ZZ&prod=23"
 #data
 <div>ZZ&pound_id=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ£_id=23"
 #data
 <div>ZZ&prod_id=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ&prod_id=23"
 #data
 <div>ZZ&pound;_id=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ£_id=23"
 #data
 <div>ZZ&prod;_id=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ∏_id=23"
 #data
 <div>ZZ&pound=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ£=23"
 #data
 <div>ZZ&prod=23</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "ZZ&prod=23"
--- a/lib/html5lib/tests/testdata/tree-construction/html5test-com.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/html5test-com.dat
@ -1,246 +0,0 @@
 #data
 <div<div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div<div>
 #data
 <div foo<bar=''>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       foo<bar=""
 #data
 <div foo=`bar`>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       foo="`bar`"
 #data
 <div \"foo=''>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       \"foo=""
 #data
 <a href='\nbar'></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       href="\nbar"
 #data
 <!DOCTYPE html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 &lang;&rang;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "⟨⟩"
 #data
 &apos;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "'"
 #data
 &ImaginaryI;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "ⅈ"
 #data
 &Kopf;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "𝕂"
 #data
 &notinva;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "∉"
 #data
 <?import namespace="foo" implementation="#bar">
 #errors
 #document
 | <!-- ?import namespace="foo" implementation="#bar" -->
 | <html>
 |   <head>
 |   <body>
 #data
 <!--foo--bar-->
 #errors
 #document
 | <!-- foo--bar -->
 | <html>
 |   <head>
 |   <body>
 #data
 <![CDATA[x]]>
 #errors
 #document
 | <!-- [CDATA[x]] -->
 | <html>
 |   <head>
 |   <body>
 #data
 <textarea><!--</textarea>--></textarea>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "<!--"
 |     "-->"
 #data
 <textarea><!--</textarea>-->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "<!--"
 |     "-->"
 #data
 <style><!--</style>--></style>
 #errors
 #document
 | <html>
 |   <head>
 |     <style>
 |       "<!--"
 |   <body>
 |     "-->"
 #data
 <style><!--</style>-->
 #errors
 #document
 | <html>
 |   <head>
 |     <style>
 |       "<!--"
 |   <body>
 |     "-->"
 #data
 <ul><li>A </li> <li>B</li></ul>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <ul>
 |       <li>
 |         "A "
 |       " "
 |       <li>
 |         "B"
 #data
 <table><form><input type=hidden><input></form><div></div></table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <input>
 |     <div>
 |     <table>
 |       <form>
 |       <input>
 |         type="hidden"
 #data
 <i>A<b>B<p></i>C</b>D
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <i>
 |       "A"
 |       <b>
 |         "B"
 |     <b>
 |     <p>
 |       <b>
 |         <i>
 |         "C"
 |       "D"
 #data
 <div></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 #data
 <svg></svg>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <math></math>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
--- a/lib/html5lib/tests/testdata/tree-construction/inbody01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/inbody01.dat
@ -1,43 +0,0 @@
 #data
 <button>1</foo>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <button>
 |       "1"
 #data
 <foo>1<p>2</foo>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       "1"
 |       <p>
 |         "2"
 #data
 <dd>1</foo>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <dd>
 |       "1"
 #data
 <foo>1<dd>2</foo>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       "1"
 |       <dd>
 |         "2"
--- a/lib/html5lib/tests/testdata/tree-construction/isindex.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/isindex.dat
@ -1,40 +0,0 @@
 #data
 <isindex>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <form>
 |       <hr>
 |       <label>
 |         "This is a searchable index. Enter search keywords: "
 |         <input>
 |           name="isindex"
 |       <hr>
 #data
 <isindex name="A" action="B" prompt="C" foo="D">
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <form>
 |       action="B"
 |       <hr>
 |       <label>
 |         "C"
 |         <input>
 |           foo="D"
 |           name="isindex"
 |       <hr>
 #data
 <form><isindex>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <form>
--- a/lib/html5lib/tests/testdata/tree-construction/pending-spec-changes-plain-text-unsafe.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/pending-spec-changes-plain-text-unsafe.dat
--- a/lib/html5lib/tests/testdata/tree-construction/pending-spec-changes.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/pending-spec-changes.dat
@ -1,52 +0,0 @@
 #data
 <input type="hidden"><frameset>
 #errors
 21: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 31: “frameset” start tag seen.
 31: End of file seen and there were open elements.
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><table><caption><svg>foo</table>bar
 #errors
 47: End tag “table” did not match the name of the current open element (“svg”).
 47: “table” closed but “caption” was still open.
 47: End tag “table” seen, but there were open elements.
 36: Unclosed element “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <svg svg>
 |           "foo"
 |     "bar"
 #data
 <table><tr><td><svg><desc><td></desc><circle>
 #errors
 7: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 30: A table cell was implicitly closed, but there were open elements.
 26: Unclosed element “desc”.
 20: Unclosed element “svg”.
 37: Stray end tag “desc”.
 45: End of file seen and there were open elements.
 45: Unclosed element “circle”.
 7: Unclosed element “table”.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |               <svg desc>
 |           <td>
 |             <circle>
--- a/lib/html5lib/tests/testdata/tree-construction/plain-text-unsafe.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/plain-text-unsafe.dat
--- a/lib/html5lib/tests/testdata/tree-construction/scriptdata01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/scriptdata01.dat
@ -1,308 +0,0 @@
 #data
 FOO<script>'Hello'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'Hello'"
 |     "BAR"
 #data
 FOO<script></script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |     "BAR"
 #data
 FOO<script></script >BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |     "BAR"
 #data
 FOO<script></script/>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |     "BAR"
 #data
 FOO<script></script/ >BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |     "BAR"
 #data
 FOO<script type="text/plain"></scriptx>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "</scriptx>BAR"
 #data
 FOO<script></script foo=">" dd>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |     "BAR"
 #data
 FOO<script>'<'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<'"
 |     "BAR"
 #data
 FOO<script>'<!'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!'"
 |     "BAR"
 #data
 FOO<script>'<!-'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-'"
 |     "BAR"
 #data
 FOO<script>'<!--'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!--'"
 |     "BAR"
 #data
 FOO<script>'<!---'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!---'"
 |     "BAR"
 #data
 FOO<script>'<!-->'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-->'"
 |     "BAR"
 #data
 FOO<script>'<!-->'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-->'"
 |     "BAR"
 #data
 FOO<script>'<!-- potato'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-- potato'"
 |     "BAR"
 #data
 FOO<script>'<!-- <sCrIpt'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-- <sCrIpt'"
 |     "BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt>'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt>'</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt> -'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt> -'</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt> --'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt> --'</script>BAR"
 #data
 FOO<script>'<!-- <sCrIpt> -->'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       "'<!-- <sCrIpt> -->'"
 |     "BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt> --!>'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt> --!>'</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt> -- >'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt> -- >'</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt '</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt '</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt/'</script>BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt\'</script>BAR
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt\'"
 |     "BAR"
 #data
 FOO<script type="text/plain">'<!-- <sCrIpt/'</script>BAR</script>QUX
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "FOO"
 |     <script>
 |       type="text/plain"
 |       "'<!-- <sCrIpt/'</script>BAR"
 |     "QUX"
--- a/lib/html5lib/tests/testdata/tree-construction/tables01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tables01.dat
@ -1,212 +0,0 @@
 #data
 <table><th>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <th>
 #data
 <table><td>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><col foo='bar'>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <colgroup>
 |         <col>
 |           foo="bar"
 #data
 <table><colgroup></html>foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "foo"
 |     <table>
 |       <colgroup>
 #data
 <table></table><p>foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |     <p>
 |       "foo"
 #data
 <table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><select><option>3</select></table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 |         "3"
 |     <table>
 #data
 <table><select><table></table></select></table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <table>
 |     <table>
 #data
 <table><select></table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <table>
 #data
 <table><select><option>A<tr><td>B</td></tr></table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 |         "A"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "B"
 #data
 <table><td></body></caption></col></colgroup></html>foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "foo"
 #data
 <table><td>A</table>B
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "A"
 |     "B"
 #data
 <table><tr><caption>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |       <caption>
 #data
 <table><tr></body></caption></col></colgroup></html></td></th><td>foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "foo"
 #data
 <table><td><tr>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |         <tr>
 #data
 <table><td><button><td>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <button>
 |           <td>
 #data
 <table><tr><td><svg><desc><td>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |               <svg desc>
 |           <td>
--- a/lib/html5lib/tests/testdata/tree-construction/tests1.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests1.dat
--- a/lib/html5lib/tests/testdata/tree-construction/tests10.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests10.dat
@ -1,799 +0,0 @@
 #data
 <!DOCTYPE html><svg></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <!DOCTYPE html><svg></svg><![CDATA[a]]>
 #errors
 29: Bogus comment
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |     <!-- [CDATA[a]] -->
 #data
 <!DOCTYPE html><body><svg></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <!DOCTYPE html><body><select><svg></svg></select>
 #errors
 35: Stray “svg” start tag.
 42: Stray end tag “svg”
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!DOCTYPE html><body><select><option><svg></svg></option></select>
 #errors
 43: Stray “svg” start tag.
 50: Stray end tag “svg”
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 #data
 <!DOCTYPE html><body><table><svg></svg></table>
 #errors
 34: Start tag “svg” seen in “table”.
 41: Stray end tag “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |     <table>
 #data
 <!DOCTYPE html><body><table><svg><g>foo</g></svg></table>
 #errors
 34: Start tag “svg” seen in “table”.
 46: Stray end tag “g”.
 53: Stray end tag “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |     <table>
 #data
 <!DOCTYPE html><body><table><svg><g>foo</g><g>bar</g></svg></table>
 #errors
 34: Start tag “svg” seen in “table”.
 46: Stray end tag “g”.
 58: Stray end tag “g”.
 65: Stray end tag “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <table>
 #data
 <!DOCTYPE html><body><table><tbody><svg><g>foo</g><g>bar</g></svg></tbody></table>
 #errors
 41: Start tag “svg” seen in “table”.
 53: Stray end tag “g”.
 65: Stray end tag “g”.
 72: Stray end tag “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <table>
 |       <tbody>
 #data
 <!DOCTYPE html><body><table><tbody><tr><svg><g>foo</g><g>bar</g></svg></tr></tbody></table>
 #errors
 45: Start tag “svg” seen in “table”.
 57: Stray end tag “g”.
 69: Stray end tag “g”.
 76: Stray end tag “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg></td></tr></tbody></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |               <svg g>
 |                 "foo"
 |               <svg g>
 |                 "bar"
 #data
 <!DOCTYPE html><body><table><tbody><tr><td><svg><g>foo</g><g>bar</g></svg><p>baz</td></tr></tbody></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |               <svg g>
 |                 "foo"
 |               <svg g>
 |                 "bar"
 |             <p>
 |               "baz"
 #data
 <!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g></svg><p>baz</caption></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <svg svg>
 |           <svg g>
 |             "foo"
 |           <svg g>
 |             "bar"
 |         <p>
 |           "baz"
 #data
 <!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
 #errors
 70: HTML start tag “p” in a foreign namespace context.
 81: “table” closed but “caption” was still open.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <svg svg>
 |           <svg g>
 |             "foo"
 |           <svg g>
 |             "bar"
 |         <p>
 |           "baz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><caption><svg><g>foo</g><g>bar</g>baz</table><p>quux
 #errors
 78: “table” closed but “caption” was still open.
 78: Unclosed elements on stack.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <svg svg>
 |           <svg g>
 |             "foo"
 |           <svg g>
 |             "bar"
 |           "baz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><colgroup><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
 #errors
 44: Start tag “svg” seen in “table”.
 56: Stray end tag “g”.
 68: Stray end tag “g”.
 71: HTML start tag “p” in a foreign namespace context.
 71: Start tag “p” seen in “table”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <p>
 |       "baz"
 |     <table>
 |       <colgroup>
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><tr><td><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
 #errors
 50: Stray “svg” start tag.
 54: Stray “g” start tag.
 62: Stray end tag “g”
 66: Stray “g” start tag.
 74: Stray end tag “g”
 77: Stray “p” start tag.
 88: “table” end tag with “select” open.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <select>
 |               "foobarbaz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><select><svg><g>foo</g><g>bar</g><p>baz</table><p>quux
 #errors
 36: Start tag “select” seen in “table”.
 42: Stray “svg” start tag.
 46: Stray “g” start tag.
 54: Stray end tag “g”
 58: Stray “g” start tag.
 66: Stray end tag “g”
 69: Stray “p” start tag.
 80: “table” end tag with “select” open.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       "foobarbaz"
 |     <table>
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body></body></html><svg><g>foo</g><g>bar</g><p>baz
 #errors
 41: Stray “svg” start tag.
 68: HTML start tag “p” in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <p>
 |       "baz"
 #data
 <!DOCTYPE html><body></body><svg><g>foo</g><g>bar</g><p>baz
 #errors
 34: Stray “svg” start tag.
 61: HTML start tag “p” in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg g>
 |         "foo"
 |       <svg g>
 |         "bar"
 |     <p>
 |       "baz"
 #data
 <!DOCTYPE html><frameset><svg><g></g><g></g><p><span>
 #errors
 31: Stray “svg” start tag.
 35: Stray “g” start tag.
 40: Stray end tag “g”
 44: Stray “g” start tag.
 49: Stray end tag “g”
 52: Stray “p” start tag.
 58: Stray “span” start tag.
 58: End of file seen and there were open elements.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><frameset></frameset><svg><g></g><g></g><p><span>
 #errors
 42: Stray “svg” start tag.
 46: Stray “g” start tag.
 51: Stray end tag “g”
 55: Stray “g” start tag.
 60: Stray end tag “g”
 63: Stray “p” start tag.
 69: Stray “span” start tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><body xlink:href=foo><svg xlink:href=foo></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     <svg svg>
 |       xlink href="foo"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo></g></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <svg svg>
 |       <svg g>
 |         xlink href="foo"
 |         xml lang="en"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo /></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <svg svg>
 |       <svg g>
 |         xlink href="foo"
 |         xml lang="en"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><svg><g xml:lang=en xlink:href=foo />bar</svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <svg svg>
 |       <svg g>
 |         xlink href="foo"
 |         xml lang="en"
 |       "bar"
 #data
 <svg></path>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <div><svg></div>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |     "a"
 #data
 <div><svg><path></div>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |         <svg path>
 |     "a"
 #data
 <div><svg><path></svg><path>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |         <svg path>
 |       <path>
 #data
 <div><svg><path><foreignObject><math></div>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |         <svg path>
 |           <svg foreignObject>
 |             <math math>
 |               "a"
 #data
 <div><svg><path><foreignObject><p></div>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |         <svg path>
 |           <svg foreignObject>
 |             <p>
 |               "a"
 #data
 <!DOCTYPE html><svg><desc><div><svg><ul>a
 #errors
 40: HTML start tag “ul” in a foreign namespace context.
 41: End of file in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg desc>
 |         <div>
 |           <svg svg>
 |           <ul>
 |             "a"
 #data
 <!DOCTYPE html><svg><desc><svg><ul>a
 #errors
 35: HTML start tag “ul” in a foreign namespace context.
 36: End of file in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg desc>
 |         <svg svg>
 |         <ul>
 |           "a"
 #data
 <!DOCTYPE html><p><svg><desc><p>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <svg svg>
 |         <svg desc>
 |           <p>
 #data
 <!DOCTYPE html><p><svg><title><p>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <svg svg>
 |         <svg title>
 |           <p>
 #data
 <div><svg><path><foreignObject><p></foreignObject><p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <svg svg>
 |         <svg path>
 |           <svg foreignObject>
 |             <p>
 |             <p>
 #data
 <math><mi><div><object><div><span></span></div></object></div></mi><mi>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         <div>
 |           <object>
 |             <div>
 |               <span>
 |       <math mi>
 #data
 <math><mi><svg><foreignObject><div><div></div></div></foreignObject></svg></mi><mi>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         <svg svg>
 |           <svg foreignObject>
 |             <div>
 |               <div>
 |       <math mi>
 #data
 <svg><script></script><path>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg script>
 |       <svg path>
 #data
 <table><svg></svg><tr>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <math><mi><mglyph>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         <math mglyph>
 #data
 <math><mi><malignmark>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         <math malignmark>
 #data
 <math><mo><mglyph>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mo>
 |         <math mglyph>
 #data
 <math><mo><malignmark>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mo>
 |         <math malignmark>
 #data
 <math><mn><mglyph>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mn>
 |         <math mglyph>
 #data
 <math><mn><malignmark>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mn>
 |         <math malignmark>
 #data
 <math><ms><mglyph>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math ms>
 |         <math mglyph>
 #data
 <math><ms><malignmark>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math ms>
 |         <math malignmark>
 #data
 <math><mtext><mglyph>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mtext>
 |         <math mglyph>
 #data
 <math><mtext><malignmark>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mtext>
 |         <math malignmark>
 #data
 <math><annotation-xml><svg></svg></annotation-xml><mi>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         <svg svg>
 |       <math mi>
 #data
 <math><annotation-xml><svg><foreignObject><div><math><mi></mi></math><span></span></div></foreignObject><path></path></svg></annotation-xml><mi>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         <svg svg>
 |           <svg foreignObject>
 |             <div>
 |               <math math>
 |                 <math mi>
 |               <span>
 |           <svg path>
 |       <math mi>
 #data
 <math><annotation-xml><svg><foreignObject><math><mi><svg></svg></mi><mo></mo></math><span></span></foreignObject><path></path></svg></annotation-xml><mi>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         <svg svg>
 |           <svg foreignObject>
 |             <math math>
 |               <math mi>
 |                 <svg svg>
 |               <math mo>
 |             <span>
 |           <svg path>
 |       <math mi>
--- a/lib/html5lib/tests/testdata/tree-construction/tests11.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests11.dat
@ -1,482 +0,0 @@
 #data
 <!DOCTYPE html><body><svg attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' contentScriptType='' contentStyleType='' diffuseConstant='' edgeMode='' externalResourcesRequired='' filterRes='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       attributeName=""
 |       attributeType=""
 |       baseFrequency=""
 |       baseProfile=""
 |       calcMode=""
 |       clipPathUnits=""
 |       contentScriptType=""
 |       contentStyleType=""
 |       diffuseConstant=""
 |       edgeMode=""
 |       externalResourcesRequired=""
 |       filterRes=""
 |       filterUnits=""
 |       glyphRef=""
 |       gradientTransform=""
 |       gradientUnits=""
 |       kernelMatrix=""
 |       kernelUnitLength=""
 |       keyPoints=""
 |       keySplines=""
 |       keyTimes=""
 |       lengthAdjust=""
 |       limitingConeAngle=""
 |       markerHeight=""
 |       markerUnits=""
 |       markerWidth=""
 |       maskContentUnits=""
 |       maskUnits=""
 |       numOctaves=""
 |       pathLength=""
 |       patternContentUnits=""
 |       patternTransform=""
 |       patternUnits=""
 |       pointsAtX=""
 |       pointsAtY=""
 |       pointsAtZ=""
 |       preserveAlpha=""
 |       preserveAspectRatio=""
 |       primitiveUnits=""
 |       refX=""
 |       refY=""
 |       repeatCount=""
 |       repeatDur=""
 |       requiredExtensions=""
 |       requiredFeatures=""
 |       specularConstant=""
 |       specularExponent=""
 |       spreadMethod=""
 |       startOffset=""
 |       stdDeviation=""
 |       stitchTiles=""
 |       surfaceScale=""
 |       systemLanguage=""
 |       tableValues=""
 |       targetX=""
 |       targetY=""
 |       textLength=""
 |       viewBox=""
 |       viewTarget=""
 |       xChannelSelector=""
 |       yChannelSelector=""
 |       zoomAndPan=""
 #data
 <!DOCTYPE html><BODY><SVG ATTRIBUTENAME='' ATTRIBUTETYPE='' BASEFREQUENCY='' BASEPROFILE='' CALCMODE='' CLIPPATHUNITS='' CONTENTSCRIPTTYPE='' CONTENTSTYLETYPE='' DIFFUSECONSTANT='' EDGEMODE='' EXTERNALRESOURCESREQUIRED='' FILTERRES='' FILTERUNITS='' GLYPHREF='' GRADIENTTRANSFORM='' GRADIENTUNITS='' KERNELMATRIX='' KERNELUNITLENGTH='' KEYPOINTS='' KEYSPLINES='' KEYTIMES='' LENGTHADJUST='' LIMITINGCONEANGLE='' MARKERHEIGHT='' MARKERUNITS='' MARKERWIDTH='' MASKCONTENTUNITS='' MASKUNITS='' NUMOCTAVES='' PATHLENGTH='' PATTERNCONTENTUNITS='' PATTERNTRANSFORM='' PATTERNUNITS='' POINTSATX='' POINTSATY='' POINTSATZ='' PRESERVEALPHA='' PRESERVEASPECTRATIO='' PRIMITIVEUNITS='' REFX='' REFY='' REPEATCOUNT='' REPEATDUR='' REQUIREDEXTENSIONS='' REQUIREDFEATURES='' SPECULARCONSTANT='' SPECULAREXPONENT='' SPREADMETHOD='' STARTOFFSET='' STDDEVIATION='' STITCHTILES='' SURFACESCALE='' SYSTEMLANGUAGE='' TABLEVALUES='' TARGETX='' TARGETY='' TEXTLENGTH='' VIEWBOX='' VIEWTARGET='' XCHANNELSELECTOR='' YCHANNELSELECTOR='' ZOOMANDPAN=''></SVG>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       attributeName=""
 |       attributeType=""
 |       baseFrequency=""
 |       baseProfile=""
 |       calcMode=""
 |       clipPathUnits=""
 |       contentScriptType=""
 |       contentStyleType=""
 |       diffuseConstant=""
 |       edgeMode=""
 |       externalResourcesRequired=""
 |       filterRes=""
 |       filterUnits=""
 |       glyphRef=""
 |       gradientTransform=""
 |       gradientUnits=""
 |       kernelMatrix=""
 |       kernelUnitLength=""
 |       keyPoints=""
 |       keySplines=""
 |       keyTimes=""
 |       lengthAdjust=""
 |       limitingConeAngle=""
 |       markerHeight=""
 |       markerUnits=""
 |       markerWidth=""
 |       maskContentUnits=""
 |       maskUnits=""
 |       numOctaves=""
 |       pathLength=""
 |       patternContentUnits=""
 |       patternTransform=""
 |       patternUnits=""
 |       pointsAtX=""
 |       pointsAtY=""
 |       pointsAtZ=""
 |       preserveAlpha=""
 |       preserveAspectRatio=""
 |       primitiveUnits=""
 |       refX=""
 |       refY=""
 |       repeatCount=""
 |       repeatDur=""
 |       requiredExtensions=""
 |       requiredFeatures=""
 |       specularConstant=""
 |       specularExponent=""
 |       spreadMethod=""
 |       startOffset=""
 |       stdDeviation=""
 |       stitchTiles=""
 |       surfaceScale=""
 |       systemLanguage=""
 |       tableValues=""
 |       targetX=""
 |       targetY=""
 |       textLength=""
 |       viewBox=""
 |       viewTarget=""
 |       xChannelSelector=""
 |       yChannelSelector=""
 |       zoomAndPan=""
 #data
 <!DOCTYPE html><body><svg attributename='' attributetype='' basefrequency='' baseprofile='' calcmode='' clippathunits='' contentscripttype='' contentstyletype='' diffuseconstant='' edgemode='' externalresourcesrequired='' filterres='' filterunits='' glyphref='' gradienttransform='' gradientunits='' kernelmatrix='' kernelunitlength='' keypoints='' keysplines='' keytimes='' lengthadjust='' limitingconeangle='' markerheight='' markerunits='' markerwidth='' maskcontentunits='' maskunits='' numoctaves='' pathlength='' patterncontentunits='' patterntransform='' patternunits='' pointsatx='' pointsaty='' pointsatz='' preservealpha='' preserveaspectratio='' primitiveunits='' refx='' refy='' repeatcount='' repeatdur='' requiredextensions='' requiredfeatures='' specularconstant='' specularexponent='' spreadmethod='' startoffset='' stddeviation='' stitchtiles='' surfacescale='' systemlanguage='' tablevalues='' targetx='' targety='' textlength='' viewbox='' viewtarget='' xchannelselector='' ychannelselector='' zoomandpan=''></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       attributeName=""
 |       attributeType=""
 |       baseFrequency=""
 |       baseProfile=""
 |       calcMode=""
 |       clipPathUnits=""
 |       contentScriptType=""
 |       contentStyleType=""
 |       diffuseConstant=""
 |       edgeMode=""
 |       externalResourcesRequired=""
 |       filterRes=""
 |       filterUnits=""
 |       glyphRef=""
 |       gradientTransform=""
 |       gradientUnits=""
 |       kernelMatrix=""
 |       kernelUnitLength=""
 |       keyPoints=""
 |       keySplines=""
 |       keyTimes=""
 |       lengthAdjust=""
 |       limitingConeAngle=""
 |       markerHeight=""
 |       markerUnits=""
 |       markerWidth=""
 |       maskContentUnits=""
 |       maskUnits=""
 |       numOctaves=""
 |       pathLength=""
 |       patternContentUnits=""
 |       patternTransform=""
 |       patternUnits=""
 |       pointsAtX=""
 |       pointsAtY=""
 |       pointsAtZ=""
 |       preserveAlpha=""
 |       preserveAspectRatio=""
 |       primitiveUnits=""
 |       refX=""
 |       refY=""
 |       repeatCount=""
 |       repeatDur=""
 |       requiredExtensions=""
 |       requiredFeatures=""
 |       specularConstant=""
 |       specularExponent=""
 |       spreadMethod=""
 |       startOffset=""
 |       stdDeviation=""
 |       stitchTiles=""
 |       surfaceScale=""
 |       systemLanguage=""
 |       tableValues=""
 |       targetX=""
 |       targetY=""
 |       textLength=""
 |       viewBox=""
 |       viewTarget=""
 |       xChannelSelector=""
 |       yChannelSelector=""
 |       zoomAndPan=""
 #data
 <!DOCTYPE html><body><math attributeName='' attributeType='' baseFrequency='' baseProfile='' calcMode='' clipPathUnits='' contentScriptType='' contentStyleType='' diffuseConstant='' edgeMode='' externalResourcesRequired='' filterRes='' filterUnits='' glyphRef='' gradientTransform='' gradientUnits='' kernelMatrix='' kernelUnitLength='' keyPoints='' keySplines='' keyTimes='' lengthAdjust='' limitingConeAngle='' markerHeight='' markerUnits='' markerWidth='' maskContentUnits='' maskUnits='' numOctaves='' pathLength='' patternContentUnits='' patternTransform='' patternUnits='' pointsAtX='' pointsAtY='' pointsAtZ='' preserveAlpha='' preserveAspectRatio='' primitiveUnits='' refX='' refY='' repeatCount='' repeatDur='' requiredExtensions='' requiredFeatures='' specularConstant='' specularExponent='' spreadMethod='' startOffset='' stdDeviation='' stitchTiles='' surfaceScale='' systemLanguage='' tableValues='' targetX='' targetY='' textLength='' viewBox='' viewTarget='' xChannelSelector='' yChannelSelector='' zoomAndPan=''></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       attributename=""
 |       attributetype=""
 |       basefrequency=""
 |       baseprofile=""
 |       calcmode=""
 |       clippathunits=""
 |       contentscripttype=""
 |       contentstyletype=""
 |       diffuseconstant=""
 |       edgemode=""
 |       externalresourcesrequired=""
 |       filterres=""
 |       filterunits=""
 |       glyphref=""
 |       gradienttransform=""
 |       gradientunits=""
 |       kernelmatrix=""
 |       kernelunitlength=""
 |       keypoints=""
 |       keysplines=""
 |       keytimes=""
 |       lengthadjust=""
 |       limitingconeangle=""
 |       markerheight=""
 |       markerunits=""
 |       markerwidth=""
 |       maskcontentunits=""
 |       maskunits=""
 |       numoctaves=""
 |       pathlength=""
 |       patterncontentunits=""
 |       patterntransform=""
 |       patternunits=""
 |       pointsatx=""
 |       pointsaty=""
 |       pointsatz=""
 |       preservealpha=""
 |       preserveaspectratio=""
 |       primitiveunits=""
 |       refx=""
 |       refy=""
 |       repeatcount=""
 |       repeatdur=""
 |       requiredextensions=""
 |       requiredfeatures=""
 |       specularconstant=""
 |       specularexponent=""
 |       spreadmethod=""
 |       startoffset=""
 |       stddeviation=""
 |       stitchtiles=""
 |       surfacescale=""
 |       systemlanguage=""
 |       tablevalues=""
 |       targetx=""
 |       targety=""
 |       textlength=""
 |       viewbox=""
 |       viewtarget=""
 |       xchannelselector=""
 |       ychannelselector=""
 |       zoomandpan=""
 #data
 <!DOCTYPE html><body><svg><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg altGlyph>
 |       <svg altGlyphDef>
 |       <svg altGlyphItem>
 |       <svg animateColor>
 |       <svg animateMotion>
 |       <svg animateTransform>
 |       <svg clipPath>
 |       <svg feBlend>
 |       <svg feColorMatrix>
 |       <svg feComponentTransfer>
 |       <svg feComposite>
 |       <svg feConvolveMatrix>
 |       <svg feDiffuseLighting>
 |       <svg feDisplacementMap>
 |       <svg feDistantLight>
 |       <svg feFlood>
 |       <svg feFuncA>
 |       <svg feFuncB>
 |       <svg feFuncG>
 |       <svg feFuncR>
 |       <svg feGaussianBlur>
 |       <svg feImage>
 |       <svg feMerge>
 |       <svg feMergeNode>
 |       <svg feMorphology>
 |       <svg feOffset>
 |       <svg fePointLight>
 |       <svg feSpecularLighting>
 |       <svg feSpotLight>
 |       <svg feTile>
 |       <svg feTurbulence>
 |       <svg foreignObject>
 |       <svg glyphRef>
 |       <svg linearGradient>
 |       <svg radialGradient>
 |       <svg textPath>
 #data
 <!DOCTYPE html><body><svg><altglyph /><altglyphdef /><altglyphitem /><animatecolor /><animatemotion /><animatetransform /><clippath /><feblend /><fecolormatrix /><fecomponenttransfer /><fecomposite /><feconvolvematrix /><fediffuselighting /><fedisplacementmap /><fedistantlight /><feflood /><fefunca /><fefuncb /><fefuncg /><fefuncr /><fegaussianblur /><feimage /><femerge /><femergenode /><femorphology /><feoffset /><fepointlight /><fespecularlighting /><fespotlight /><fetile /><feturbulence /><foreignobject /><glyphref /><lineargradient /><radialgradient /><textpath /></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg altGlyph>
 |       <svg altGlyphDef>
 |       <svg altGlyphItem>
 |       <svg animateColor>
 |       <svg animateMotion>
 |       <svg animateTransform>
 |       <svg clipPath>
 |       <svg feBlend>
 |       <svg feColorMatrix>
 |       <svg feComponentTransfer>
 |       <svg feComposite>
 |       <svg feConvolveMatrix>
 |       <svg feDiffuseLighting>
 |       <svg feDisplacementMap>
 |       <svg feDistantLight>
 |       <svg feFlood>
 |       <svg feFuncA>
 |       <svg feFuncB>
 |       <svg feFuncG>
 |       <svg feFuncR>
 |       <svg feGaussianBlur>
 |       <svg feImage>
 |       <svg feMerge>
 |       <svg feMergeNode>
 |       <svg feMorphology>
 |       <svg feOffset>
 |       <svg fePointLight>
 |       <svg feSpecularLighting>
 |       <svg feSpotLight>
 |       <svg feTile>
 |       <svg feTurbulence>
 |       <svg foreignObject>
 |       <svg glyphRef>
 |       <svg linearGradient>
 |       <svg radialGradient>
 |       <svg textPath>
 #data
 <!DOCTYPE html><BODY><SVG><ALTGLYPH /><ALTGLYPHDEF /><ALTGLYPHITEM /><ANIMATECOLOR /><ANIMATEMOTION /><ANIMATETRANSFORM /><CLIPPATH /><FEBLEND /><FECOLORMATRIX /><FECOMPONENTTRANSFER /><FECOMPOSITE /><FECONVOLVEMATRIX /><FEDIFFUSELIGHTING /><FEDISPLACEMENTMAP /><FEDISTANTLIGHT /><FEFLOOD /><FEFUNCA /><FEFUNCB /><FEFUNCG /><FEFUNCR /><FEGAUSSIANBLUR /><FEIMAGE /><FEMERGE /><FEMERGENODE /><FEMORPHOLOGY /><FEOFFSET /><FEPOINTLIGHT /><FESPECULARLIGHTING /><FESPOTLIGHT /><FETILE /><FETURBULENCE /><FOREIGNOBJECT /><GLYPHREF /><LINEARGRADIENT /><RADIALGRADIENT /><TEXTPATH /></SVG>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg altGlyph>
 |       <svg altGlyphDef>
 |       <svg altGlyphItem>
 |       <svg animateColor>
 |       <svg animateMotion>
 |       <svg animateTransform>
 |       <svg clipPath>
 |       <svg feBlend>
 |       <svg feColorMatrix>
 |       <svg feComponentTransfer>
 |       <svg feComposite>
 |       <svg feConvolveMatrix>
 |       <svg feDiffuseLighting>
 |       <svg feDisplacementMap>
 |       <svg feDistantLight>
 |       <svg feFlood>
 |       <svg feFuncA>
 |       <svg feFuncB>
 |       <svg feFuncG>
 |       <svg feFuncR>
 |       <svg feGaussianBlur>
 |       <svg feImage>
 |       <svg feMerge>
 |       <svg feMergeNode>
 |       <svg feMorphology>
 |       <svg feOffset>
 |       <svg fePointLight>
 |       <svg feSpecularLighting>
 |       <svg feSpotLight>
 |       <svg feTile>
 |       <svg feTurbulence>
 |       <svg foreignObject>
 |       <svg glyphRef>
 |       <svg linearGradient>
 |       <svg radialGradient>
 |       <svg textPath>
 #data
 <!DOCTYPE html><body><math><altGlyph /><altGlyphDef /><altGlyphItem /><animateColor /><animateMotion /><animateTransform /><clipPath /><feBlend /><feColorMatrix /><feComponentTransfer /><feComposite /><feConvolveMatrix /><feDiffuseLighting /><feDisplacementMap /><feDistantLight /><feFlood /><feFuncA /><feFuncB /><feFuncG /><feFuncR /><feGaussianBlur /><feImage /><feMerge /><feMergeNode /><feMorphology /><feOffset /><fePointLight /><feSpecularLighting /><feSpotLight /><feTile /><feTurbulence /><foreignObject /><glyphRef /><linearGradient /><radialGradient /><textPath /></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math altglyph>
 |       <math altglyphdef>
 |       <math altglyphitem>
 |       <math animatecolor>
 |       <math animatemotion>
 |       <math animatetransform>
 |       <math clippath>
 |       <math feblend>
 |       <math fecolormatrix>
 |       <math fecomponenttransfer>
 |       <math fecomposite>
 |       <math feconvolvematrix>
 |       <math fediffuselighting>
 |       <math fedisplacementmap>
 |       <math fedistantlight>
 |       <math feflood>
 |       <math fefunca>
 |       <math fefuncb>
 |       <math fefuncg>
 |       <math fefuncr>
 |       <math fegaussianblur>
 |       <math feimage>
 |       <math femerge>
 |       <math femergenode>
 |       <math femorphology>
 |       <math feoffset>
 |       <math fepointlight>
 |       <math fespecularlighting>
 |       <math fespotlight>
 |       <math fetile>
 |       <math feturbulence>
 |       <math foreignobject>
 |       <math glyphref>
 |       <math lineargradient>
 |       <math radialgradient>
 |       <math textpath>
 #data
 <!DOCTYPE html><body><svg><solidColor /></svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg solidcolor>
--- a/lib/html5lib/tests/testdata/tree-construction/tests12.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests12.dat
@ -1,62 +0,0 @@
 #data
 <!DOCTYPE html><body><p>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       "foo"
 |       <math math>
 |         <math mtext>
 |           <i>
 |             "baz"
 |         <math annotation-xml>
 |           <svg svg>
 |             <svg desc>
 |               <b>
 |                 "eggs"
 |             <svg g>
 |               <svg foreignObject>
 |                 <p>
 |                   "spam"
 |                 <table>
 |                   <tbody>
 |                     <tr>
 |                       <td>
 |                         <img>
 |             <svg g>
 |               "quux"
 |       "bar"
 #data
 <!DOCTYPE html><body>foo<math><mtext><i>baz</i></mtext><annotation-xml><svg><desc><b>eggs</b></desc><g><foreignObject><P>spam<TABLE><tr><td><img></td></table></foreignObject></g><g>quux</g></svg></annotation-xml></math>bar
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "foo"
 |     <math math>
 |       <math mtext>
 |         <i>
 |           "baz"
 |       <math annotation-xml>
 |         <svg svg>
 |           <svg desc>
 |             <b>
 |               "eggs"
 |           <svg g>
 |             <svg foreignObject>
 |               <p>
 |                 "spam"
 |               <table>
 |                 <tbody>
 |                   <tr>
 |                     <td>
 |                       <img>
 |           <svg g>
 |             "quux"
 |     "bar"
--- a/lib/html5lib/tests/testdata/tree-construction/tests14.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests14.dat
@ -1,74 +0,0 @@
 #data
 <!DOCTYPE html><html><body><xyz:abc></xyz:abc>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <xyz:abc>
 #data
 <!DOCTYPE html><html><body><xyz:abc></xyz:abc><span></span>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <xyz:abc>
 |     <span>
 #data
 <!DOCTYPE html><html><html abc:def=gh><xyz:abc></xyz:abc>
 #errors
 15: Unexpected start tag html
 #document
 | <!DOCTYPE html>
 | <html>
 |   abc:def="gh"
 |   <head>
 |   <body>
 |     <xyz:abc>
 #data
 <!DOCTYPE html><html xml:lang=bar><html xml:lang=foo>
 #errors
 15: Unexpected start tag html
 #document
 | <!DOCTYPE html>
 | <html>
 |   xml:lang="bar"
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><html 123=456>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   123="456"
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><html 123=456><html 789=012>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   123="456"
 |   789="012"
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><html><body 789=012>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     789="012"
--- a/lib/html5lib/tests/testdata/tree-construction/tests15.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests15.dat
@ -1,208 +0,0 @@
 #data
 <!DOCTYPE html><p><b><i><u></p> <p>X
 #errors
 Line: 1 Col: 31 Unexpected end tag (p). Ignored.
 Line: 1 Col: 36 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <b>
 |         <i>
 |           <u>
 |     <b>
 |       <i>
 |         <u>
 |           " "
 |           <p>
 |             "X"
 #data
 <p><b><i><u></p>
 <p>X
 #errors
 Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE.
 Line: 1 Col: 16 Unexpected end tag (p). Ignored.
 Line: 2 Col: 4 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <b>
 |         <i>
 |           <u>
 |     <b>
 |       <i>
 |         <u>
 |           "
 "
 |           <p>
 |             "X"
 #data
 <!doctype html></html> <head>
 #errors
 Line: 1 Col: 22 Unexpected end tag (html) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     " "
 #data
 <!doctype html></body><meta>
 #errors
 Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <meta>
 #data
 <html></html><!-- foo -->
 #errors
 Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE.
 Line: 1 Col: 13 Unexpected end tag (html) after the (implied) root element.
 #document
 | <html>
 |   <head>
 |   <body>
 | <!--  foo  -->
 #data
 <!doctype html></body><title>X</title>
 #errors
 Line: 1 Col: 22 Unexpected end tag (body) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <title>
 |       "X"
 #data
 <!doctype html><table> X<meta></table>
 #errors
 Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 30 Unexpected start tag (meta) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     " X"
 |     <meta>
 |     <table>
 #data
 <!doctype html><table> x</table>
 #errors
 Line: 1 Col: 24 Unexpected non-space characters in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     " x"
 |     <table>
 #data
 <!doctype html><table> x </table>
 #errors
 Line: 1 Col: 25 Unexpected non-space characters in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     " x "
 |     <table>
 #data
 <!doctype html><table><tr> x</table>
 #errors
 Line: 1 Col: 28 Unexpected non-space characters in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     " x"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><table>X<style> <tr>x </style> </table>
 #errors
 Line: 1 Col: 23 Unexpected non-space characters in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 |     <table>
 |       <style>
 |         " <tr>x "
 |       " "
 #data
 <!doctype html><div><table><a>foo</a> <tr><td>bar</td> </tr></table></div>
 #errors
 Line: 1 Col: 30 Unexpected start tag (a) in table context caused voodoo mode.
 Line: 1 Col: 37 Unexpected end tag (a) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <a>
 |         "foo"
 |       <table>
 |         " "
 |         <tbody>
 |           <tr>
 |             <td>
 |               "bar"
 |             " "
 #data
 <frame></frame></frame><frameset><frame><frameset><frame></frameset><noframes></frameset><noframes>
 #errors
 6: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 13: Stray start tag “frame”.
 21: Stray end tag “frame”.
 29: Stray end tag “frame”.
 39: “frameset” start tag after “body” already open.
 105: End of file seen inside an [R]CDATA element.
 105: End of file seen and there were open elements.
 XXX: These errors are wrong, please fix me!
 #document
 | <html>
 |   <head>
 |   <frameset>
 |     <frame>
 |     <frameset>
 |       <frame>
 |     <noframes>
 |       "</frameset><noframes>"
 #data
 <!DOCTYPE html><object></html>
 #errors
 1: Expected closing tag. Unexpected end of file
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <object>
--- a/lib/html5lib/tests/testdata/tree-construction/tests16.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests16.dat
--- a/lib/html5lib/tests/testdata/tree-construction/tests17.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests17.dat
@ -1,153 +0,0 @@
 #data
 <!doctype html><table><tbody><select><tr>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><table><tr><select><td>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <!doctype html><table><tr><td><select><td>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <select>
 |           <td>
 #data
 <!doctype html><table><tr><th><select><td>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <th>
 |             <select>
 |           <td>
 #data
 <!doctype html><table><caption><select><tr>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <select>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><select><tr>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><td>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><th>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><tbody>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><thead>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><tfoot>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><select><caption>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!doctype html><table><tr></table>a
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |     "a"
--- a/lib/html5lib/tests/testdata/tree-construction/tests18.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests18.dat
@ -1,269 +0,0 @@
 #data
 <!doctype html><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 #data
 <!doctype html><table><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 |     <table>
 #data
 <!doctype html><table><tbody><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 |     <table>
 |       <tbody>
 #data
 <!doctype html><table><tbody><tr><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><table><tbody><tr><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><table><td><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <plaintext>
 |               "</plaintext>"
 #data
 <!doctype html><table><caption><plaintext></plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <plaintext>
 |           "</plaintext>"
 #data
 <!doctype html><table><tr><style></script></style>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "abc"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <style>
 |             "</script>"
 #data
 <!doctype html><table><tr><script></style></script>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "abc"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <script>
 |             "</style>"
 #data
 <!doctype html><table><caption><style></script></style>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <style>
 |           "</script>"
 |         "abc"
 #data
 <!doctype html><table><td><style></script></style>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <style>
 |               "</script>"
 |             "abc"
 #data
 <!doctype html><select><script></style></script>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <script>
 |         "</style>"
 |       "abc"
 #data
 <!doctype html><table><select><script></style></script>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <script>
 |         "</style>"
 |       "abc"
 |     <table>
 #data
 <!doctype html><table><tr><select><script></style></script>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <script>
 |         "</style>"
 |       "abc"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><frameset></frameset><noframes>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 |   <noframes>
 |     "abc"
 #data
 <!doctype html><frameset></frameset><noframes>abc</noframes><!--abc-->
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 |   <noframes>
 |     "abc"
 |   <!-- abc -->
 #data
 <!doctype html><frameset></frameset></html><noframes>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 |   <noframes>
 |     "abc"
 #data
 <!doctype html><frameset></frameset></html><noframes>abc</noframes><!--abc-->
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 |   <noframes>
 |     "abc"
 | <!-- abc -->
 #data
 <!doctype html><table><tr></tbody><tfoot>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |       <tfoot>
 #data
 <!doctype html><table><td><svg></svg>abc<td>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |             "abc"
 |           <td>
--- a/lib/html5lib/tests/testdata/tree-construction/tests19.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests19.dat
--- a/lib/html5lib/tests/testdata/tree-construction/tests2.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests2.dat
@ -1,763 +0,0 @@
 #data
 <!DOCTYPE html>Test
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "Test"
 #data
 <textarea>test</div>test
 #errors
 Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
 Line: 1 Col: 24 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "test</div>test"
 #data
 <table><td>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase.
 Line: 1 Col: 11 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><td>test</tbody></table>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 11 Unexpected table cell start tag (td) in the table body phase.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "test"
 #data
 <frame>test
 #errors
 Line: 1 Col: 7 Unexpected start tag (frame). Expected DOCTYPE.
 Line: 1 Col: 7 Unexpected start tag frame. Ignored.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "test"
 #data
 <!DOCTYPE html><frameset>test
 #errors
 Line: 1 Col: 29 Unepxected characters in the frameset phase. Characters ignored.
 Line: 1 Col: 29 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><frameset><!DOCTYPE html>
 #errors
 Line: 1 Col: 40 Unexpected DOCTYPE. Ignored.
 Line: 1 Col: 40 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><font><p><b>test</font>
 #errors
 Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm.
 Line: 1 Col: 38 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <font>
 |     <p>
 |       <font>
 |         <b>
 |           "test"
 #data
 <!DOCTYPE html><dt><div><dd>
 #errors
 Line: 1 Col: 28 Missing end tag (div, dt).
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <dt>
 |       <div>
 |     <dd>
 #data
 <script></x
 #errors
 Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE.
 Line: 1 Col: 11 Unexpected end of file. Expected end tag (script).
 #document
 | <html>
 |   <head>
 |     <script>
 |       "</x"
 |   <body>
 #data
 <table><plaintext><td>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 18 Unexpected start tag (plaintext) in table context caused voodoo mode.
 Line: 1 Col: 22 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "<td>"
 |     <table>
 #data
 <plaintext></plaintext>
 #errors
 Line: 1 Col: 11 Unexpected start tag (plaintext). Expected DOCTYPE.
 Line: 1 Col: 23 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <plaintext>
 |       "</plaintext>"
 #data
 <!DOCTYPE html><table><tr>TEST
 #errors
 Line: 1 Col: 30 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 30 Unexpected end of file. Expected table content.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "TEST"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!DOCTYPE html><body t1=1><body t2=2><body t3=3 t4=4>
 #errors
 Line: 1 Col: 37 Unexpected start tag (body).
 Line: 1 Col: 53 Unexpected start tag (body).
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     t1="1"
 |     t2="2"
 |     t3="3"
 |     t4="4"
 #data
 </b test
 #errors
 Line: 1 Col: 8 Unexpected end of file in attribute name.
 Line: 1 Col: 8 End tag contains unexpected attributes.
 Line: 1 Col: 8 Unexpected end tag (b). Expected DOCTYPE.
 Line: 1 Col: 8 Unexpected end tag (b) after the (implied) root element.
 #document
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html></b test<b &=&amp>X
 #errors
 Line: 1 Col: 32 Named entity didn't end with ';'.
 Line: 1 Col: 33 End tag contains unexpected attributes.
 Line: 1 Col: 33 Unexpected end tag (b) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 #data
 <!doctypehtml><scrIPt type=text/x-foobar;baz>X</SCRipt
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 Line: 1 Col: 54 Unexpected end of file in the tag name.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <script>
 |       type="text/x-foobar;baz"
 |       "X</SCRipt"
 |   <body>
 #data
 &
 #errors
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&"
 #data
 &#
 #errors
 Line: 1 Col: 1 Numeric entity expected. Got end of file instead.
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&#"
 #data
 &#X
 #errors
 Line: 1 Col: 3 Numeric entity expected but none found.
 Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&#X"
 #data
 &#x
 #errors
 Line: 1 Col: 3 Numeric entity expected but none found.
 Line: 1 Col: 3 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&#x"
 #data
 &#45
 #errors
 Line: 1 Col: 4 Numeric entity didn't end with ';'.
 Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "-"
 #data
 &x-test
 #errors
 Line: 1 Col: 1 Named entity expected. Got none.
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&x-test"
 #data
 <!doctypehtml><p><li>
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <li>
 #data
 <!doctypehtml><p><dt>
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <dt>
 #data
 <!doctypehtml><p><dd>
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <dd>
 #data
 <!doctypehtml><p><form>
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 Line: 1 Col: 23 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <form>
 #data
 <!DOCTYPE html><p></P>X
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     "X"
 #data
 &AMP
 #errors
 Line: 1 Col: 4 Named entity didn't end with ';'.
 Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&"
 #data
 &AMp;
 #errors
 Line: 1 Col: 1 Named entity expected. Got none.
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "&AMp;"
 #data
 <!DOCTYPE html><html><head></head><body><thisISasillyTESTelementNameToMakeSureCrazyTagNamesArePARSEDcorrectLY>
 #errors
 Line: 1 Col: 110 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <thisisasillytestelementnametomakesurecrazytagnamesareparsedcorrectly>
 #data
 <!DOCTYPE html>X</body>X
 #errors
 Line: 1 Col: 24 Unexpected non-space characters in the after body phase.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "XX"
 #data
 <!DOCTYPE html><!-- X
 #errors
 Line: 1 Col: 21 Unexpected end of file in comment.
 #document
 | <!DOCTYPE html>
 | <!--  X -->
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><table><caption>test TEST</caption><td>test
 #errors
 Line: 1 Col: 54 Unexpected table cell start tag (td) in the table body phase.
 Line: 1 Col: 58 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         "test TEST"
 |       <tbody>
 |         <tr>
 |           <td>
 |             "test"
 #data
 <!DOCTYPE html><select><option><optgroup>
 #errors
 Line: 1 Col: 41 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 |       <optgroup>
 #data
 <!DOCTYPE html><select><optgroup><option></optgroup><option><select><option>
 #errors
 Line: 1 Col: 68 Unexpected select start tag in the select phase treated as select end tag.
 Line: 1 Col: 76 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <optgroup>
 |         <option>
 |       <option>
 |     <option>
 #data
 <!DOCTYPE html><select><optgroup><option><optgroup>
 #errors
 Line: 1 Col: 51 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <optgroup>
 |         <option>
 |       <optgroup>
 #data
 <!DOCTYPE html><datalist><option>foo</datalist>bar
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <datalist>
 |       <option>
 |         "foo"
 |     "bar"
 #data
 <!DOCTYPE html><font><input><input></font>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <font>
 |       <input>
 |       <input>
 #data
 <!DOCTYPE html><!-- XXX - XXX -->
 #errors
 #document
 | <!DOCTYPE html>
 | <!--  XXX - XXX  -->
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><!-- XXX - XXX
 #errors
 Line: 1 Col: 29 Unexpected end of file in comment (-)
 #document
 | <!DOCTYPE html>
 | <!--  XXX - XXX -->
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><!-- XXX - XXX - XXX -->
 #errors
 #document
 | <!DOCTYPE html>
 | <!--  XXX - XXX - XXX  -->
 | <html>
 |   <head>
 |   <body>
 #data
 <isindex test=x name=x>
 #errors
 Line: 1 Col: 23 Unexpected start tag (isindex). Expected DOCTYPE.
 Line: 1 Col: 23 Unexpected start tag isindex. Don't use it!
 #document
 | <html>
 |   <head>
 |   <body>
 |     <form>
 |       <hr>
 |       <label>
 |         "This is a searchable index. Enter search keywords: "
 |         <input>
 |           name="isindex"
 |           test="x"
 |       <hr>
 #data
 test
 test
 #errors
 Line: 2 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "test
 test"
 #data
 <!DOCTYPE html><body><title>test</body></title>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <title>
 |       "test</body>"
 #data
 <!DOCTYPE html><body><title>X</title><meta name=z><link rel=foo><style>
 x { content:"</style" } </style>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <title>
 |       "X"
 |     <meta>
 |       name="z"
 |     <link>
 |       rel="foo"
 |     <style>
 |       "
 x { content:"</style" } "
 #data
 <!DOCTYPE html><select><optgroup></optgroup></select>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <optgroup>
 #data
 #errors
 Line: 2 Col: 1 Unexpected End of file. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html>  <html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><script>
 </script>  <title>x</title>  </head>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <script>
 |       "
 "
 |     "  "
 |     <title>
 |       "x"
 |     "  "
 |   <body>
 #data
 <!DOCTYPE html><html><body><html id=x>
 #errors
 Line: 1 Col: 38 html needs to be the first start tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   id="x"
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html>X</body><html id="x">
 #errors
 Line: 1 Col: 36 Unexpected start tag token (html) in the after body phase.
 Line: 1 Col: 36 html needs to be the first start tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   id="x"
 |   <head>
 |   <body>
 |     "X"
 #data
 <!DOCTYPE html><head><html id=x>
 #errors
 Line: 1 Col: 32 html needs to be the first start tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   id="x"
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html>X</html>X
 #errors
 Line: 1 Col: 24 Unexpected non-space characters in the after body phase.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "XX"
 #data
 <!DOCTYPE html>X</html> 
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X "
 #data
 <!DOCTYPE html>X</html><p>X
 #errors
 Line: 1 Col: 26 Unexpected start tag (p).
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 |     <p>
 |       "X"
 #data
 <!DOCTYPE html>X<p/x/y/z>
 #errors
 Line: 1 Col: 19 Expected a > after the /.
 Line: 1 Col: 21 Solidus (/) incorrectly placed in tag.
 Line: 1 Col: 23 Solidus (/) incorrectly placed in tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 |     <p>
 |       x=""
 |       y=""
 |       z=""
 #data
 <!DOCTYPE html><!--x--
 #errors
 Line: 1 Col: 22 Unexpected end of file in comment (--).
 #document
 | <!DOCTYPE html>
 | <!-- x -->
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE html><table><tr><td></p></table>
 #errors
 Line: 1 Col: 34 Unexpected end tag (p). Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <p>
 #data
 <!DOCTYPE <!DOCTYPE HTML>><!--<!--x-->-->
 #errors
 Line: 1 Col: 20 Expected space or '>'. Got ''
 Line: 1 Col: 25 Erroneous DOCTYPE.
 Line: 1 Col: 35 Unexpected character in comment found.
 #document
 | <!DOCTYPE <!doctype>
 | <html>
 |   <head>
 |   <body>
 |     ">"
 |     <!-- <!--x -->
 |     "-->"
 #data
 <!doctype html><div><form></form><div></div></div>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <form>
 |       <div>
--- a/lib/html5lib/tests/testdata/tree-construction/tests20.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests20.dat
@ -1,455 +0,0 @@
 #data
 <!doctype html><p><button><button>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |       <button>
 #data
 <!doctype html><p><button><address>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <address>
 #data
 <!doctype html><p><button><blockquote>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <blockquote>
 #data
 <!doctype html><p><button><menu>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <menu>
 #data
 <!doctype html><p><button><p>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <p>
 #data
 <!doctype html><p><button><ul>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <ul>
 #data
 <!doctype html><p><button><h1>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <h1>
 #data
 <!doctype html><p><button><h6>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <h6>
 #data
 <!doctype html><p><button><listing>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <listing>
 #data
 <!doctype html><p><button><pre>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <pre>
 #data
 <!doctype html><p><button><form>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <form>
 #data
 <!doctype html><p><button><li>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <li>
 #data
 <!doctype html><p><button><dd>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <dd>
 #data
 <!doctype html><p><button><dt>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <dt>
 #data
 <!doctype html><p><button><plaintext>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <plaintext>
 #data
 <!doctype html><p><button><table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <table>
 #data
 <!doctype html><p><button><hr>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <hr>
 #data
 <!doctype html><p><button><xmp>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <xmp>
 #data
 <!doctype html><p><button></p>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <button>
 |         <p>
 #data
 <!doctype html><address><button></address>a
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <address>
 |       <button>
 |     "a"
 #data
 <!doctype html><address><button></address>a
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <address>
 |       <button>
 |     "a"
 #data
 <p><table></p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <p>
 |       <table>
 #data
 <!doctype html><svg>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <!doctype html><p><figcaption>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <figcaption>
 #data
 <!doctype html><p><summary>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <summary>
 #data
 <!doctype html><form><table><form>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <form>
 |       <table>
 #data
 <!doctype html><table><form><form>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <form>
 #data
 <!doctype html><table><form></table><form>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <form>
 #data
 <!doctype html><svg><foreignObject><p>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg foreignObject>
 |         <p>
 #data
 <!doctype html><svg><title>abc
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg title>
 |         "abc"
 #data
 <option><span><option>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <option>
 |       <span>
 |         <option>
 #data
 <option><option>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <option>
 |     <option>
 #data
 <math><annotation-xml><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |     <div>
 #data
 <math><annotation-xml encoding="application/svg+xml"><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding="application/svg+xml"
 |     <div>
 #data
 <math><annotation-xml encoding="application/xhtml+xml"><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding="application/xhtml+xml"
 |         <div>
 #data
 <math><annotation-xml encoding="aPPlication/xhtmL+xMl"><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding="aPPlication/xhtmL+xMl"
 |         <div>
 #data
 <math><annotation-xml encoding="text/html"><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding="text/html"
 |         <div>
 #data
 <math><annotation-xml encoding="Text/htmL"><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding="Text/htmL"
 |         <div>
 #data
 <math><annotation-xml encoding=" text/html "><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         encoding=" text/html "
 |     <div>
--- a/lib/html5lib/tests/testdata/tree-construction/tests21.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests21.dat
@ -1,221 +0,0 @@
 #data
 <svg><![CDATA[foo]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "foo"
 #data
 <math><![CDATA[foo]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       "foo"
 #data
 <div><![CDATA[foo]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <!-- [CDATA[foo]] -->
 #data
 <svg><![CDATA[foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "foo"
 #data
 <svg><![CDATA[foo
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "foo"
 #data
 <svg><![CDATA[
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <svg><![CDATA[]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 #data
 <svg><![CDATA[]] >]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "]] >"
 #data
 <svg><![CDATA[]] >]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "]] >"
 #data
 <svg><![CDATA[]]
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "]]"
 #data
 <svg><![CDATA[]
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "]"
 #data
 <svg><![CDATA[]>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "]>a"
 #data
 <svg><foreignObject><div><![CDATA[foo]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg foreignObject>
 |         <div>
 |           <!-- [CDATA[foo]] -->
 #data
 <svg><![CDATA[<svg>]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>"
 #data
 <svg><![CDATA[</svg>a]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "</svg>a"
 #data
 <svg><![CDATA[<svg>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>a"
 #data
 <svg><![CDATA[</svg>a
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "</svg>a"
 #data
 <svg><![CDATA[<svg>]]><path>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>"
 |       <svg path>
 #data
 <svg><![CDATA[<svg>]]></path>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>"
 #data
 <svg><![CDATA[<svg>]]><!--path-->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>"
 |       <!-- path -->
 #data
 <svg><![CDATA[<svg>]]>path
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<svg>path"
 #data
 <svg><![CDATA[<!--svg-->]]>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       "<!--svg-->"
--- a/lib/html5lib/tests/testdata/tree-construction/tests22.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests22.dat
@ -1,157 +0,0 @@
 #data
 <a><b><big><em><strong><div>X</a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <b>
 |         <big>
 |           <em>
 |             <strong>
 |     <big>
 |       <em>
 |         <strong>
 |           <div>
 |             <a>
 |               "X"
 #data
 <a><b><div id=1><div id=2><div id=3><div id=4><div id=5><div id=6><div id=7><div id=8>A</a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <b>
 |     <b>
 |       <div>
 |         id="1"
 |         <a>
 |         <div>
 |           id="2"
 |           <a>
 |           <div>
 |             id="3"
 |             <a>
 |             <div>
 |               id="4"
 |               <a>
 |               <div>
 |                 id="5"
 |                 <a>
 |                 <div>
 |                   id="6"
 |                   <a>
 |                   <div>
 |                     id="7"
 |                     <a>
 |                     <div>
 |                       id="8"
 |                       <a>
 |                         "A"
 #data
 <a><b><div id=1><div id=2><div id=3><div id=4><div id=5><div id=6><div id=7><div id=8><div id=9>A</a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <b>
 |     <b>
 |       <div>
 |         id="1"
 |         <a>
 |         <div>
 |           id="2"
 |           <a>
 |           <div>
 |             id="3"
 |             <a>
 |             <div>
 |               id="4"
 |               <a>
 |               <div>
 |                 id="5"
 |                 <a>
 |                 <div>
 |                   id="6"
 |                   <a>
 |                   <div>
 |                     id="7"
 |                     <a>
 |                     <div>
 |                       id="8"
 |                       <a>
 |                         <div>
 |                           id="9"
 |                           "A"
 #data
 <a><b><div id=1><div id=2><div id=3><div id=4><div id=5><div id=6><div id=7><div id=8><div id=9><div id=10>A</a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       <b>
 |     <b>
 |       <div>
 |         id="1"
 |         <a>
 |         <div>
 |           id="2"
 |           <a>
 |           <div>
 |             id="3"
 |             <a>
 |             <div>
 |               id="4"
 |               <a>
 |               <div>
 |                 id="5"
 |                 <a>
 |                 <div>
 |                   id="6"
 |                   <a>
 |                   <div>
 |                     id="7"
 |                     <a>
 |                     <div>
 |                       id="8"
 |                       <a>
 |                         <div>
 |                           id="9"
 |                           <div>
 |                             id="10"
 |                             "A"
 #data
 <cite><b><cite><i><cite><i><cite><i><div>X</b>TEST
 #errors
 Line: 1 Col: 6 Unexpected start tag (cite). Expected DOCTYPE.
 Line: 1 Col: 46 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm.
 Line: 1 Col: 50 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <cite>
 |       <b>
 |         <cite>
 |           <i>
 |             <cite>
 |               <i>
 |                 <cite>
 |                   <i>
 |       <i>
 |         <i>
 |           <div>
 |             <b>
 |               "X"
 |             "TEST"
--- a/lib/html5lib/tests/testdata/tree-construction/tests23.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests23.dat
@ -1,155 +0,0 @@
 #data
 <p><font size=4><font color=red><font size=4><font size=4><font size=4><font size=4><font size=4><font color=red><p>X
 #errors
 3: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 116: Unclosed elements.
 117: End of file seen and there were open elements.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <font>
 |         size="4"
 |         <font>
 |           color="red"
 |           <font>
 |             size="4"
 |             <font>
 |               size="4"
 |               <font>
 |                 size="4"
 |                 <font>
 |                   size="4"
 |                   <font>
 |                     size="4"
 |                     <font>
 |                       color="red"
 |     <p>
 |       <font>
 |         color="red"
 |         <font>
 |           size="4"
 |           <font>
 |             size="4"
 |             <font>
 |               size="4"
 |               <font>
 |                 color="red"
 |                 "X"
 #data
 <p><font size=4><font size=4><font size=4><font size=4><p>X
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <font>
 |         size="4"
 |         <font>
 |           size="4"
 |           <font>
 |             size="4"
 |             <font>
 |               size="4"
 |     <p>
 |       <font>
 |         size="4"
 |         <font>
 |           size="4"
 |           <font>
 |             size="4"
 |             "X"
 #data
 <p><font size=4><font size=4><font size=4><font size="5"><font size=4><p>X
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <font>
 |         size="4"
 |         <font>
 |           size="4"
 |           <font>
 |             size="4"
 |             <font>
 |               size="5"
 |               <font>
 |                 size="4"
 |     <p>
 |       <font>
 |         size="4"
 |         <font>
 |           size="4"
 |           <font>
 |             size="5"
 |             <font>
 |               size="4"
 |               "X"
 #data
 <p><font size=4 id=a><font size=4 id=b><font size=4><font size=4><p>X
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <font>
 |         id="a"
 |         size="4"
 |         <font>
 |           id="b"
 |           size="4"
 |           <font>
 |             size="4"
 |             <font>
 |               size="4"
 |     <p>
 |       <font>
 |         id="a"
 |         size="4"
 |         <font>
 |           id="b"
 |           size="4"
 |           <font>
 |             size="4"
 |             <font>
 |               size="4"
 |               "X"
 #data
 <p><b id=a><b id=a><b id=a><b><object><b id=a><b id=a>X</object><p>Y
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <b>
 |         id="a"
 |         <b>
 |           id="a"
 |           <b>
 |             id="a"
 |             <b>
 |               <object>
 |                 <b>
 |                   id="a"
 |                   <b>
 |                     id="a"
 |                     "X"
 |     <p>
 |       <b>
 |         id="a"
 |         <b>
 |           id="a"
 |           <b>
 |             id="a"
 |             <b>
 |               "Y"
--- a/lib/html5lib/tests/testdata/tree-construction/tests24.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests24.dat
@ -1,79 +0,0 @@
 #data
 <!DOCTYPE html>&NotEqualTilde;
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "≂̸"
 #data
 <!DOCTYPE html>&NotEqualTilde;A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "≂̸A"
 #data
 <!DOCTYPE html>&ThickSpace;
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "  "
 #data
 <!DOCTYPE html>&ThickSpace;A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "  A"
 #data
 <!DOCTYPE html>&NotSubset;
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "⊂⃒"
 #data
 <!DOCTYPE html>&NotSubset;A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "⊂⃒A"
 #data
 <!DOCTYPE html>&Gopf;
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "𝔾"
 #data
 <!DOCTYPE html>&Gopf;A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "𝔾A"
--- a/lib/html5lib/tests/testdata/tree-construction/tests25.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests25.dat
@ -1,219 +0,0 @@
 #data
 <!DOCTYPE html><body><foo>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       "A"
 #data
 <!DOCTYPE html><body><area>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <area>
 |     "A"
 #data
 <!DOCTYPE html><body><base>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <base>
 |     "A"
 #data
 <!DOCTYPE html><body><basefont>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <basefont>
 |     "A"
 #data
 <!DOCTYPE html><body><bgsound>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <bgsound>
 |     "A"
 #data
 <!DOCTYPE html><body><br>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <br>
 |     "A"
 #data
 <!DOCTYPE html><body><col>A
 #errors
 26: Stray start tag “col”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "A"
 #data
 <!DOCTYPE html><body><command>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <command>
 |     "A"
 #data
 <!DOCTYPE html><body><embed>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <embed>
 |     "A"
 #data
 <!DOCTYPE html><body><frame>A
 #errors
 26: Stray start tag “frame”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "A"
 #data
 <!DOCTYPE html><body><hr>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <hr>
 |     "A"
 #data
 <!DOCTYPE html><body><img>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <img>
 |     "A"
 #data
 <!DOCTYPE html><body><input>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <input>
 |     "A"
 #data
 <!DOCTYPE html><body><keygen>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <keygen>
 |     "A"
 #data
 <!DOCTYPE html><body><link>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <link>
 |     "A"
 #data
 <!DOCTYPE html><body><meta>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <meta>
 |     "A"
 #data
 <!DOCTYPE html><body><param>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <param>
 |     "A"
 #data
 <!DOCTYPE html><body><source>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <source>
 |     "A"
 #data
 <!DOCTYPE html><body><track>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <track>
 |     "A"
 #data
 <!DOCTYPE html><body><wbr>A
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <wbr>
 |     "A"
--- a/lib/html5lib/tests/testdata/tree-construction/tests26.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests26.dat
@ -1,313 +0,0 @@
 #data
 <!DOCTYPE html><body><a href='#1'><nobr>1<nobr></a><br><a href='#2'><nobr>2<nobr></a><br><a href='#3'><nobr>3<nobr></a>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       href="#1"
 |       <nobr>
 |         "1"
 |       <nobr>
 |     <nobr>
 |       <br>
 |       <a>
 |         href="#2"
 |     <a>
 |       href="#2"
 |       <nobr>
 |         "2"
 |       <nobr>
 |     <nobr>
 |       <br>
 |       <a>
 |         href="#3"
 |     <a>
 |       href="#3"
 |       <nobr>
 |         "3"
 |       <nobr>
 #data
 <!DOCTYPE html><body><b><nobr>1<nobr></b><i><nobr>2<nobr></i>3
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |       <nobr>
 |     <nobr>
 |       <i>
 |     <i>
 |       <nobr>
 |         "2"
 |       <nobr>
 |     <nobr>
 |       "3"
 #data
 <!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |         <nobr>
 |           <i>
 |         <i>
 |           <nobr>
 |             "2"
 |           <nobr>
 |         <nobr>
 |           "3"
 |         <table>
 #data
 <!DOCTYPE html><body><b><nobr>1<table><tr><td><nobr></b><i><nobr>2<nobr></i>3
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |         <table>
 |           <tbody>
 |             <tr>
 |               <td>
 |                 <nobr>
 |                   <i>
 |                 <i>
 |                   <nobr>
 |                     "2"
 |                   <nobr>
 |                 <nobr>
 |                   "3"
 #data
 <!DOCTYPE html><body><b><nobr>1<div><nobr></b><i><nobr>2<nobr></i>3
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |     <div>
 |       <b>
 |         <nobr>
 |         <nobr>
 |       <nobr>
 |         <i>
 |       <i>
 |         <nobr>
 |           "2"
 |         <nobr>
 |       <nobr>
 |         "3"
 #data
 <!DOCTYPE html><body><b><nobr>1<nobr></b><div><i><nobr>2<nobr></i>3
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |       <nobr>
 |     <div>
 |       <nobr>
 |         <i>
 |       <i>
 |         <nobr>
 |           "2"
 |         <nobr>
 |       <nobr>
 |         "3"
 #data
 <!DOCTYPE html><body><b><nobr>1<nobr><ins></b><i><nobr>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |       <nobr>
 |         <ins>
 |     <nobr>
 |       <i>
 |     <i>
 |       <nobr>
 #data
 <!DOCTYPE html><body><b><nobr>1<ins><nobr></b><i>2
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       <nobr>
 |         "1"
 |         <ins>
 |       <nobr>
 |     <nobr>
 |       <i>
 |         "2"
 #data
 <!DOCTYPE html><body><b>1<nobr></b><i><nobr>2</i>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       "1"
 |       <nobr>
 |     <nobr>
 |       <i>
 |     <i>
 |       <nobr>
 |         "2"
 #data
 <p><code x</code></p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <code>
 |         code=""
 |         x<=""
 |     <code>
 |       code=""
 |       x<=""
 |       "
 "
 #data
 <!DOCTYPE html><svg><foreignObject><p><i></p>a
 #errors
 45: End tag “p” seen, but there were open elements.
 41: Unclosed element “i”.
 46: End of file seen and there were open elements.
 35: Unclosed element “foreignObject”.
 20: Unclosed element “svg”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg foreignObject>
 |         <p>
 |           <i>
 |         <i>
 |           "a"
 #data
 <!DOCTYPE html><table><tr><td><svg><foreignObject><p><i></p>a
 #errors
 56: End tag “p” seen, but there were open elements.
 52: Unclosed element “i”.
 57: End of file seen and there were open elements.
 46: Unclosed element “foreignObject”.
 31: Unclosed element “svg”.
 22: Unclosed element “table”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <svg svg>
 |               <svg foreignObject>
 |                 <p>
 |                   <i>
 |                 <i>
 |                   "a"
 #data
 <!DOCTYPE html><math><mtext><p><i></p>a
 #errors
 38: End tag “p” seen, but there were open elements.
 34: Unclosed element “i”.
 39: End of file in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mtext>
 |         <p>
 |           <i>
 |         <i>
 |           "a"
 #data
 <!DOCTYPE html><table><tr><td><math><mtext><p><i></p>a
 #errors
 53: End tag “p” seen, but there were open elements.
 49: Unclosed element “i”.
 54: End of file in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <math math>
 |               <math mtext>
 |                 <p>
 |                   <i>
 |                 <i>
 |                   "a"
 #data
 <!DOCTYPE html><body><div><!/div>a
 #errors
 29: Bogus comment.
 34: End of file seen and there were open elements.
 26: Unclosed element “div”.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <!-- /div -->
 |       "a"
--- a/lib/html5lib/tests/testdata/tree-construction/tests3.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests3.dat
@ -1,305 +0,0 @@
 #data
 <head></head><style></style>
 #errors
 Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
 Line: 1 Col: 20 Unexpected start tag (style) that can be in head. Moved.
 #document
 | <html>
 |   <head>
 |     <style>
 |   <body>
 #data
 <head></head><script></script>
 #errors
 Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
 Line: 1 Col: 21 Unexpected start tag (script) that can be in head. Moved.
 #document
 | <html>
 |   <head>
 |     <script>
 |   <body>
 #data
 <head></head><!-- --><style></style><!-- --><script></script>
 #errors
 Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
 Line: 1 Col: 28 Unexpected start tag (style) that can be in head. Moved.
 #document
 | <html>
 |   <head>
 |     <style>
 |     <script>
 |   <!--   -->
 |   <!--   -->
 |   <body>
 #data
 <head></head><!-- -->x<style></style><!-- --><script></script>
 #errors
 Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <!--   -->
 |   <body>
 |     "x"
 |     <style>
 |     <!--   -->
 |     <script>
 #data
 <!DOCTYPE html><html><head></head><body><pre>
 </pre></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 #data
 <!DOCTYPE html><html><head></head><body><pre>
 foo</pre></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "foo"
 #data
 <!DOCTYPE html><html><head></head><body><pre>
 foo</pre></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "
 foo"
 #data
 <!DOCTYPE html><html><head></head><body><pre>
 foo
 </pre></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "foo
 "
 #data
 <!DOCTYPE html><html><head></head><body><pre>x</pre><span>
 </span></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "x"
 |     <span>
 |       "
 "
 #data
 <!DOCTYPE html><html><head></head><body><pre>x
 y</pre></body></html>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "x
 y"
 #data
 <!DOCTYPE html><html><head></head><body><pre>x<div>
 y</pre></body></html>
 #errors
 Line: 2 Col: 7 End tag (pre) seen too early. Expected other end tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "x"
 |       <div>
 |         "
 y"
 #data
 <!DOCTYPE html><pre>&#x0a;&#x0a;A</pre>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <pre>
 |       "
 A"
 #data
 <!DOCTYPE html><HTML><META><HEAD></HEAD></HTML>
 #errors
 Line: 1 Col: 33 Unexpected start tag head in existing head. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <meta>
 |   <body>
 #data
 <!DOCTYPE html><HTML><HEAD><head></HEAD></HTML>
 #errors
 Line: 1 Col: 33 Unexpected start tag head in existing head. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <textarea>foo<span>bar</span><i>baz
 #errors
 Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
 Line: 1 Col: 35 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "foo<span>bar</span><i>baz"
 #data
 <title>foo<span>bar</em><i>baz
 #errors
 Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 Line: 1 Col: 30 Unexpected end of file. Expected end tag (title).
 #document
 | <html>
 |   <head>
 |     <title>
 |       "foo<span>bar</em><i>baz"
 |   <body>
 #data
 <!DOCTYPE html><textarea>
 </textarea>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 #data
 <!DOCTYPE html><textarea>
 foo</textarea>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "foo"
 #data
 <!DOCTYPE html><textarea>
 foo</textarea>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       "
 foo"
 #data
 <!DOCTYPE html><html><head></head><body><ul><li><div><p><li></ul></body></html>
 #errors
 Line: 1 Col: 60 Missing end tag (div, li).
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <ul>
 |       <li>
 |         <div>
 |           <p>
 |       <li>
 #data
 <!doctype html><nobr><nobr><nobr>
 #errors
 Line: 1 Col: 27 Unexpected start tag (nobr) implies end tag (nobr).
 Line: 1 Col: 33 Unexpected start tag (nobr) implies end tag (nobr).
 Line: 1 Col: 33 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <nobr>
 |     <nobr>
 |     <nobr>
 #data
 <!doctype html><nobr><nobr></nobr><nobr>
 #errors
 Line: 1 Col: 27 Unexpected start tag (nobr) implies end tag (nobr).
 Line: 1 Col: 40 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <nobr>
 |     <nobr>
 |     <nobr>
 #data
 <!doctype html><html><body><p><table></table></body></html>
 #errors
 Not known
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <table>
 #data
 <p><table></table>
 #errors
 Not known
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <table>
--- a/lib/html5lib/tests/testdata/tree-construction/tests4.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests4.dat
@ -1,59 +0,0 @@
 #data
 direct div content
 #errors
 #document-fragment
 div
 #document
 | "direct div content"
 #data
 direct textarea content
 #errors
 #document-fragment
 textarea
 #document
 | "direct textarea content"
 #data
 textarea content with <em>pseudo</em> <foo>markup
 #errors
 #document-fragment
 textarea
 #document
 | "textarea content with <em>pseudo</em> <foo>markup"
 #data
 this is &#x0043;DATA inside a <style> element
 #errors
 #document-fragment
 style
 #document
 | "this is &#x0043;DATA inside a <style> element"
 #data
 </plaintext>
 #errors
 #document-fragment
 plaintext
 #document
 | "</plaintext>"
 #data
 setting html's innerHTML
 #errors
 Line: 1 Col: 24 Unexpected EOF in inner html mode.
 #document-fragment
 html
 #document
 | <head>
 | <body>
 |   "setting html's innerHTML"
 #data
 <title>setting head's innerHTML</title>
 #errors
 #document-fragment
 head
 #document
 | <title>
 |   "setting head's innerHTML"
--- a/lib/html5lib/tests/testdata/tree-construction/tests5.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests5.dat
@ -1,191 +0,0 @@
 #data
 <style> <!-- </style>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 Line: 1 Col: 22 Unexpected end of file. Expected end tag (style).
 #document
 | <html>
 |   <head>
 |     <style>
 |       " <!-- "
 |   <body>
 |     "x"
 #data
 <style> <!-- </style> --> </style>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <style>
 |       " <!-- "
 |     " "
 |   <body>
 |     "--> x"
 #data
 <style> <!--> </style>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <style>
 |       " <!--> "
 |   <body>
 |     "x"
 #data
 <style> <!---> </style>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <style>
 |       " <!---> "
 |   <body>
 |     "x"
 #data
 <iframe> <!---> </iframe>x
 #errors
 Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <iframe>
 |       " <!---> "
 |     "x"
 #data
 <iframe> <!--- </iframe>->x</iframe> --> </iframe>x
 #errors
 Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <iframe>
 |       " <!--- "
 |     "->x --> x"
 #data
 <script> <!-- </script> --> </script>x
 #errors
 Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <script>
 |       " <!-- "
 |     " "
 |   <body>
 |     "--> x"
 #data
 <title> <!-- </title> --> </title>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <title>
 |       " <!-- "
 |     " "
 |   <body>
 |     "--> x"
 #data
 <textarea> <!--- </textarea>->x</textarea> --> </textarea>x
 #errors
 Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <textarea>
 |       " <!--- "
 |     "->x --> x"
 #data
 <style> <!</-- </style>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <style>
 |       " <!</-- "
 |   <body>
 |     "x"
 #data
 <p><xmp></xmp>
 #errors
 XXX: Unknown
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |     <xmp>
 #data
 <xmp> <!-- > --> </xmp>
 #errors
 Line: 1 Col: 5 Unexpected start tag (xmp). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <xmp>
 |       " <!-- > --> "
 #data
 <title>&amp;</title>
 #errors
 Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <title>
 |       "&"
 |   <body>
 #data
 <title><!--&amp;--></title>
 #errors
 Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <title>
 |       "<!--&-->"
 |   <body>
 #data
 <title><!--</title>
 #errors
 Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE.
 Line: 1 Col: 19 Unexpected end of file. Expected end tag (title).
 #document
 | <html>
 |   <head>
 |     <title>
 |       "<!--"
 |   <body>
 #data
 <noscript><!--</noscript>--></noscript>
 #errors
 Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |     <noscript>
 |       "<!--"
 |   <body>
 |     "-->"
--- a/lib/html5lib/tests/testdata/tree-construction/tests6.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests6.dat
@ -1,663 +0,0 @@
 #data
 <!doctype html></head> <head>
 #errors
 Line: 1 Col: 29 Unexpected start tag head. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   " "
 |   <body>
 #data
 <!doctype html><form><div></form><div>
 #errors
 33: End tag "form" seen but there were unclosed elements.
 38: End of file seen and there were open elements.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <form>
 |       <div>
 |         <div>
 #data
 <!doctype html><title>&amp;</title>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <title>
 |       "&"
 |   <body>
 #data
 <!doctype html><title><!--&amp;--></title>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <title>
 |       "<!--&-->"
 |   <body>
 #data
 <!doctype>
 #errors
 Line: 1 Col: 9 No space after literal string 'DOCTYPE'.
 Line: 1 Col: 10 Unexpected > character. Expected DOCTYPE name.
 Line: 1 Col: 10 Erroneous DOCTYPE.
 #document
 | <!DOCTYPE >
 | <html>
 |   <head>
 |   <body>
 #data
 <!---x
 #errors
 Line: 1 Col: 6 Unexpected end of file in comment.
 Line: 1 Col: 6 Unexpected End of file. Expected DOCTYPE.
 #document
 | <!-- -x -->
 | <html>
 |   <head>
 |   <body>
 #data
 <body>
 <div>
 #errors
 Line: 1 Col: 6 Unexpected start tag (body).
 Line: 2 Col: 5 Expected closing tag. Unexpected end of file.
 #document-fragment
 div
 #document
 | "
 "
 | <div>
 #data
 <frameset></frameset>
 foo
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 2 Col: 3 Unexpected non-space characters in the after frameset phase. Ignored.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   "
 "
 #data
 <frameset></frameset>
 <noframes>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 2 Col: 10 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   "
 "
 |   <noframes>
 #data
 <frameset></frameset>
 <div>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 2 Col: 5 Unexpected start tag (div) in the after frameset phase. Ignored.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   "
 "
 #data
 <frameset></frameset>
 </html>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   "
 "
 #data
 <frameset></frameset>
 </div>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 2 Col: 6 Unexpected end tag (div) in the after frameset phase. Ignored.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   "
 "
 #data
 <form><form>
 #errors
 Line: 1 Col: 6 Unexpected start tag (form). Expected DOCTYPE.
 Line: 1 Col: 12 Unexpected start tag (form).
 Line: 1 Col: 12 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <form>
 #data
 <button><button>
 #errors
 Line: 1 Col: 8 Unexpected start tag (button). Expected DOCTYPE.
 Line: 1 Col: 16 Unexpected start tag (button) implies end tag (button).
 Line: 1 Col: 16 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <button>
 |     <button>
 #data
 <table><tr><td></th>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 20 Unexpected end tag (th). Ignored.
 Line: 1 Col: 20 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><caption><td>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 20 Unexpected end tag (td). Ignored.
 Line: 1 Col: 20 Unexpected table cell start tag (td) in the table body phase.
 Line: 1 Col: 20 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><caption><div>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 21 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <div>
 #data
 </caption><div>
 #errors
 Line: 1 Col: 10 Unexpected end tag (caption). Ignored.
 Line: 1 Col: 15 Expected closing tag. Unexpected end of file.
 #document-fragment
 caption
 #document
 | <div>
 #data
 <table><caption><div></caption>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 31 Unexpected end tag (caption). Missing end tag (div).
 Line: 1 Col: 31 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <div>
 #data
 <table><caption></table>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 24 Unexpected end table tag in caption. Generates implied end caption.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 #data
 </table><div>
 #errors
 Line: 1 Col: 8 Unexpected end table tag in caption. Generates implied end caption.
 Line: 1 Col: 8 Unexpected end tag (caption). Ignored.
 Line: 1 Col: 13 Expected closing tag. Unexpected end of file.
 #document-fragment
 caption
 #document
 | <div>
 #data
 <table><caption></body></col></colgroup></html></tbody></td></tfoot></th></thead></tr>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 23 Unexpected end tag (body). Ignored.
 Line: 1 Col: 29 Unexpected end tag (col). Ignored.
 Line: 1 Col: 40 Unexpected end tag (colgroup). Ignored.
 Line: 1 Col: 47 Unexpected end tag (html). Ignored.
 Line: 1 Col: 55 Unexpected end tag (tbody). Ignored.
 Line: 1 Col: 60 Unexpected end tag (td). Ignored.
 Line: 1 Col: 68 Unexpected end tag (tfoot). Ignored.
 Line: 1 Col: 73 Unexpected end tag (th). Ignored.
 Line: 1 Col: 81 Unexpected end tag (thead). Ignored.
 Line: 1 Col: 86 Unexpected end tag (tr). Ignored.
 Line: 1 Col: 86 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 #data
 <table><caption><div></div>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 27 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <div>
 #data
 <table><tr><td></body></caption></col></colgroup></html>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 22 Unexpected end tag (body). Ignored.
 Line: 1 Col: 32 Unexpected end tag (caption). Ignored.
 Line: 1 Col: 38 Unexpected end tag (col). Ignored.
 Line: 1 Col: 49 Unexpected end tag (colgroup). Ignored.
 Line: 1 Col: 56 Unexpected end tag (html). Ignored.
 Line: 1 Col: 56 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 </table></tbody></tfoot></thead></tr><div>
 #errors
 Line: 1 Col: 8 Unexpected end tag (table). Ignored.
 Line: 1 Col: 16 Unexpected end tag (tbody). Ignored.
 Line: 1 Col: 24 Unexpected end tag (tfoot). Ignored.
 Line: 1 Col: 32 Unexpected end tag (thead). Ignored.
 Line: 1 Col: 37 Unexpected end tag (tr). Ignored.
 Line: 1 Col: 42 Expected closing tag. Unexpected end of file.
 #document-fragment
 td
 #document
 | <div>
 #data
 <table><colgroup>foo
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 20 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 20 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "foo"
 |     <table>
 |       <colgroup>
 #data
 foo<col>
 #errors
 Line: 1 Col: 3 Unexpected end tag (colgroup). Ignored.
 #document-fragment
 colgroup
 #document
 | <col>
 #data
 <table><colgroup></col>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 23 This element (col) has no end tag.
 Line: 1 Col: 23 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <colgroup>
 #data
 <frameset><div>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 1 Col: 15 Unexpected start tag token (div) in the frameset phase. Ignored.
 Line: 1 Col: 15 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 </frameset><frame>
 #errors
 Line: 1 Col: 11 Unexpected end tag token (frameset) in the frameset phase (innerHTML).
 #document-fragment
 frameset
 #document
 | <frame>
 #data
 <frameset></div>
 #errors
 Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE.
 Line: 1 Col: 16 Unexpected end tag token (div) in the frameset phase. Ignored.
 Line: 1 Col: 16 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 </body><div>
 #errors
 Line: 1 Col: 7 Unexpected end tag (body). Ignored.
 Line: 1 Col: 12 Expected closing tag. Unexpected end of file.
 #document-fragment
 body
 #document
 | <div>
 #data
 <table><tr><div>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 16 Unexpected start tag (div) in table context caused voodoo mode.
 Line: 1 Col: 16 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 </tr><td>
 #errors
 Line: 1 Col: 5 Unexpected end tag (tr). Ignored.
 #document-fragment
 tr
 #document
 | <td>
 #data
 </tbody></tfoot></thead><td>
 #errors
 Line: 1 Col: 8 Unexpected end tag (tbody). Ignored.
 Line: 1 Col: 16 Unexpected end tag (tfoot). Ignored.
 Line: 1 Col: 24 Unexpected end tag (thead). Ignored.
 #document-fragment
 tr
 #document
 | <td>
 #data
 <table><tr><div><td>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 16 Unexpected start tag (div) in table context caused voodoo mode.
 Line: 1 Col: 20 Unexpected implied end tag (div) in the table row phase.
 Line: 1 Col: 20 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <caption><col><colgroup><tbody><tfoot><thead><tr>
 #errors
 Line: 1 Col: 9 Unexpected start tag (caption).
 Line: 1 Col: 14 Unexpected start tag (col).
 Line: 1 Col: 24 Unexpected start tag (colgroup).
 Line: 1 Col: 31 Unexpected start tag (tbody).
 Line: 1 Col: 38 Unexpected start tag (tfoot).
 Line: 1 Col: 45 Unexpected start tag (thead).
 Line: 1 Col: 49 Unexpected end of file. Expected table content.
 #document-fragment
 tbody
 #document
 | <tr>
 #data
 <table><tbody></thead>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 22 Unexpected end tag (thead) in the table body phase. Ignored.
 Line: 1 Col: 22 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 #data
 </table><tr>
 #errors
 Line: 1 Col: 8 Unexpected end tag (table). Ignored.
 Line: 1 Col: 12 Unexpected end of file. Expected table content.
 #document-fragment
 tbody
 #document
 | <tr>
 #data
 <table><tbody></body></caption></col></colgroup></html></td></th></tr>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 21 Unexpected end tag (body) in the table body phase. Ignored.
 Line: 1 Col: 31 Unexpected end tag (caption) in the table body phase. Ignored.
 Line: 1 Col: 37 Unexpected end tag (col) in the table body phase. Ignored.
 Line: 1 Col: 48 Unexpected end tag (colgroup) in the table body phase. Ignored.
 Line: 1 Col: 55 Unexpected end tag (html) in the table body phase. Ignored.
 Line: 1 Col: 60 Unexpected end tag (td) in the table body phase. Ignored.
 Line: 1 Col: 65 Unexpected end tag (th) in the table body phase. Ignored.
 Line: 1 Col: 70 Unexpected end tag (tr) in the table body phase. Ignored.
 Line: 1 Col: 70 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 #data
 <table><tbody></div>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 20 Unexpected end tag (div) in table context caused voodoo mode.
 Line: 1 Col: 20 End tag (div) seen too early. Expected other end tag.
 Line: 1 Col: 20 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 #data
 <table><table>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 14 Unexpected start tag (table) implies end tag (table).
 Line: 1 Col: 14 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |     <table>
 #data
 <table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 14 Unexpected end tag (body). Ignored.
 Line: 1 Col: 24 Unexpected end tag (caption). Ignored.
 Line: 1 Col: 30 Unexpected end tag (col). Ignored.
 Line: 1 Col: 41 Unexpected end tag (colgroup). Ignored.
 Line: 1 Col: 48 Unexpected end tag (html). Ignored.
 Line: 1 Col: 56 Unexpected end tag (tbody). Ignored.
 Line: 1 Col: 61 Unexpected end tag (td). Ignored.
 Line: 1 Col: 69 Unexpected end tag (tfoot). Ignored.
 Line: 1 Col: 74 Unexpected end tag (th). Ignored.
 Line: 1 Col: 82 Unexpected end tag (thead). Ignored.
 Line: 1 Col: 87 Unexpected end tag (tr). Ignored.
 Line: 1 Col: 87 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 #data
 </table><tr>
 #errors
 Line: 1 Col: 8 Unexpected end tag (table). Ignored.
 Line: 1 Col: 12 Unexpected end of file. Expected table content.
 #document-fragment
 table
 #document
 | <tbody>
 |   <tr>
 #data
 <body></body></html>
 #errors
 Line: 1 Col: 20 Unexpected html end tag in inner html mode.
 Line: 1 Col: 20 Unexpected EOF in inner html mode.
 #document-fragment
 html
 #document
 | <head>
 | <body>
 #data
 <html><frameset></frameset></html> 
 #errors
 Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <frameset>
 |   " "
 #data
 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"><html></html>
 #errors
 Line: 1 Col: 50 Erroneous DOCTYPE.
 Line: 1 Col: 63 Unexpected end tag (html) after the (implied) root element.
 #document
 | <!DOCTYPE html "-//W3C//DTD HTML 4.01//EN" "">
 | <html>
 |   <head>
 |   <body>
 #data
 <param><frameset></frameset>
 #errors
 Line: 1 Col: 7 Unexpected start tag (param). Expected DOCTYPE.
 Line: 1 Col: 17 Unexpected start tag (frameset).
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 <source><frameset></frameset>
 #errors
 Line: 1 Col: 7 Unexpected start tag (source). Expected DOCTYPE.
 Line: 1 Col: 17 Unexpected start tag (frameset).
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 <track><frameset></frameset>
 #errors
 Line: 1 Col: 7 Unexpected start tag (track). Expected DOCTYPE.
 Line: 1 Col: 17 Unexpected start tag (frameset).
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 </html><frameset></frameset>
 #errors
 7: End tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 17: Stray “frameset” start tag.
 17: “frameset” start tag seen.
 #document
 | <html>
 |   <head>
 |   <frameset>
 #data
 </body><frameset></frameset>
 #errors
 7: End tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”.
 17: Stray “frameset” start tag.
 17: “frameset” start tag seen.
 #document
 | <html>
 |   <head>
 |   <frameset>
--- a/lib/html5lib/tests/testdata/tree-construction/tests7.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests7.dat
@ -1,390 +0,0 @@
 #data
 <!doctype html><body><title>X</title>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <title>
 |       "X"
 #data
 <!doctype html><table><title>X</title></table>
 #errors
 Line: 1 Col: 29 Unexpected start tag (title) in table context caused voodoo mode.
 Line: 1 Col: 38 Unexpected end tag (title) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <title>
 |       "X"
 |     <table>
 #data
 <!doctype html><head></head><title>X</title>
 #errors
 Line: 1 Col: 35 Unexpected start tag (title) that can be in head. Moved.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <title>
 |       "X"
 |   <body>
 #data
 <!doctype html></head><title>X</title>
 #errors
 Line: 1 Col: 29 Unexpected start tag (title) that can be in head. Moved.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |     <title>
 |       "X"
 |   <body>
 #data
 <!doctype html><table><meta></table>
 #errors
 Line: 1 Col: 28 Unexpected start tag (meta) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <meta>
 |     <table>
 #data
 <!doctype html><table>X<tr><td><table> <meta></table></table>
 #errors
 Line: 1 Col: 23 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 45 Unexpected start tag (meta) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <meta>
 |             <table>
 |               " "
 #data
 <!doctype html><html> <head>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <!doctype html> <head>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <!doctype html><table><style> <tr>x </style> </table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <style>
 |         " <tr>x "
 |       " "
 #data
 <!doctype html><table><TBODY><script> <tr>x </script> </table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <script>
 |           " <tr>x "
 |         " "
 #data
 <!doctype html><p><applet><p>X</p></applet>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <applet>
 |         <p>
 |           "X"
 #data
 <!doctype html><listing>
 X</listing>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <listing>
 |       "X"
 #data
 <!doctype html><select><input>X
 #errors
 Line: 1 Col: 30 Unexpected input start tag in the select phase.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <input>
 |     "X"
 #data
 <!doctype html><select><select>X
 #errors
 Line: 1 Col: 31 Unexpected select start tag in the select phase treated as select end tag.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     "X"
 #data
 <!doctype html><table><input type=hidDEN></table>
 #errors
 Line: 1 Col: 41 Unexpected input with type hidden in table context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <input>
 |         type="hidDEN"
 #data
 <!doctype html><table>X<input type=hidDEN></table>
 #errors
 Line: 1 Col: 23 Unexpected non-space characters in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     "X"
 |     <table>
 |       <input>
 |         type="hidDEN"
 #data
 <!doctype html><table>  <input type=hidDEN></table>
 #errors
 Line: 1 Col: 43 Unexpected input with type hidden in table context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       "  "
 |       <input>
 |         type="hidDEN"
 #data
 <!doctype html><table>  <input type='hidDEN'></table>
 #errors
 Line: 1 Col: 45 Unexpected input with type hidden in table context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       "  "
 |       <input>
 |         type="hidDEN"
 #data
 <!doctype html><table><input type=" hidden"><input type=hidDEN></table>
 #errors
 Line: 1 Col: 44 Unexpected start tag (input) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <input>
 |       type=" hidden"
 |     <table>
 |       <input>
 |         type="hidDEN"
 #data
 <!doctype html><table><select>X<tr>
 #errors
 Line: 1 Col: 30 Unexpected start tag (select) in table context caused voodoo mode.
 Line: 1 Col: 35 Unexpected table element start tag (trs) in the select in table phase.
 Line: 1 Col: 35 Unexpected end of file. Expected table content.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       "X"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!doctype html><select>X</select>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       "X"
 #data
 <!DOCTYPE hTmL><html></html>
 #errors
 Line: 1 Col: 28 Unexpected end tag (html) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <!DOCTYPE HTML><html></html>
 #errors
 Line: 1 Col: 28 Unexpected end tag (html) after the (implied) root element.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 #data
 <body>X</body></body>
 #errors
 Line: 1 Col: 21 Unexpected end tag token (body) in the after body phase.
 Line: 1 Col: 21 Unexpected EOF in inner html mode.
 #document-fragment
 html
 #document
 | <head>
 | <body>
 |   "X"
 #data
 <div><p>a</x> b
 #errors
 Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE.
 Line: 1 Col: 13 Unexpected end tag (x). Ignored.
 Line: 1 Col: 15 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <p>
 |         "a b"
 #data
 <table><tr><td><code></code> </table>
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <code>
 |             " "
 #data
 <table><b><tr><td>aaa</td></tr>bbb</table>ccc
 #errors
 XXX: Fix me
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |     <b>
 |       "bbb"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "aaa"
 |     <b>
 |       "ccc"
 #data
 A<table><tr> B</tr> B</table>
 #errors
 XXX: Fix me
 #document
 | <html>
 |   <head>
 |   <body>
 |     "A B B"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 A<table><tr> B</tr> </em>C</table>
 #errors
 XXX: Fix me
 #document
 | <html>
 |   <head>
 |   <body>
 |     "A BC"
 |     <table>
 |       <tbody>
 |         <tr>
 |         " "
 #data
 <select><keygen>
 #errors
 Not known
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |     <keygen>
--- a/lib/html5lib/tests/testdata/tree-construction/tests8.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests8.dat
@ -1,148 +0,0 @@
 #data
 <div>
 <div></div>
 </span>x
 #errors
 Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE.
 Line: 3 Col: 7 Unexpected end tag (span). Ignored.
 Line: 3 Col: 8 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "
 "
 |       <div>
 |       "
 x"
 #data
 <div>x<div></div>
 </span>x
 #errors
 Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE.
 Line: 2 Col: 7 Unexpected end tag (span). Ignored.
 Line: 2 Col: 8 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "x"
 |       <div>
 |       "
 x"
 #data
 <div>x<div></div>x</span>x
 #errors
 Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE.
 Line: 1 Col: 25 Unexpected end tag (span). Ignored.
 Line: 1 Col: 26 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "x"
 |       <div>
 |       "xx"
 #data
 <div>x<div></div>y</span>z
 #errors
 Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE.
 Line: 1 Col: 25 Unexpected end tag (span). Ignored.
 Line: 1 Col: 26 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "x"
 |       <div>
 |       "yz"
 #data
 <table><div>x<div></div>x</span>x
 #errors
 Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE.
 Line: 1 Col: 12 Unexpected start tag (div) in table context caused voodoo mode.
 Line: 1 Col: 18 Unexpected start tag (div) in table context caused voodoo mode.
 Line: 1 Col: 24 Unexpected end tag (div) in table context caused voodoo mode.
 Line: 1 Col: 32 Unexpected end tag (span) in table context caused voodoo mode.
 Line: 1 Col: 32 Unexpected end tag (span). Ignored.
 Line: 1 Col: 33 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "x"
 |       <div>
 |       "xx"
 |     <table>
 #data
 x<table>x
 #errors
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 Line: 1 Col: 9 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 9 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "xx"
 |     <table>
 #data
 x<table><table>x
 #errors
 Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE.
 Line: 1 Col: 15 Unexpected start tag (table) implies end tag (table).
 Line: 1 Col: 16 Unexpected non-space characters in table context caused voodoo mode.
 Line: 1 Col: 16 Unexpected end of file. Expected table content.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "x"
 |     <table>
 |     "x"
 |     <table>
 #data
 <b>a<div></div><div></b>y
 #errors
 Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE.
 Line: 1 Col: 24 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm.
 Line: 1 Col: 25 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |       "a"
 |       <div>
 |     <div>
 |       <b>
 |       "y"
 #data
 <a><div><p></a>
 #errors
 Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE.
 Line: 1 Col: 15 End tag (a) violates step 1, paragraph 3 of the adoption agency algorithm.
 Line: 1 Col: 15 End tag (a) violates step 1, paragraph 3 of the adoption agency algorithm.
 Line: 1 Col: 15 Expected closing tag. Unexpected end of file.
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |     <div>
 |       <a>
 |       <p>
 |         <a>
--- a/lib/html5lib/tests/testdata/tree-construction/tests9.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests9.dat
@ -1,457 +0,0 @@
 #data
 <!DOCTYPE html><math></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 #data
 <!DOCTYPE html><body><math></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 #data
 <!DOCTYPE html><math><mi>
 #errors
 25: End of file in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 #data
 <!DOCTYPE html><math><annotation-xml><svg><u>
 #errors
 45: HTML start tag “u” in a foreign namespace context.
 45: End of file seen and there were open elements.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math annotation-xml>
 |         <svg svg>
 |     <u>
 #data
 <!DOCTYPE html><body><select><math></math></select>
 #errors
 Line: 1 Col: 35 Unexpected start tag token (math) in the select phase. Ignored.
 Line: 1 Col: 42 Unexpected end tag (math) in the select phase. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 #data
 <!DOCTYPE html><body><select><option><math></math></option></select>
 #errors
 Line: 1 Col: 43 Unexpected start tag token (math) in the select phase. Ignored.
 Line: 1 Col: 50 Unexpected end tag (math) in the select phase. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 #data
 <!DOCTYPE html><body><table><math></math></table>
 #errors
 Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 41 Unexpected end tag (math) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |     <table>
 #data
 <!DOCTYPE html><body><table><math><mi>foo</mi></math></table>
 #errors
 Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 46 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 53 Unexpected end tag (math) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |     <table>
 #data
 <!DOCTYPE html><body><table><math><mi>foo</mi><mi>bar</mi></math></table>
 #errors
 Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 46 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 58 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 65 Unexpected end tag (math) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <table>
 #data
 <!DOCTYPE html><body><table><tbody><math><mi>foo</mi><mi>bar</mi></math></tbody></table>
 #errors
 Line: 1 Col: 41 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 53 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 65 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 72 Unexpected end tag (math) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <table>
 |       <tbody>
 #data
 <!DOCTYPE html><body><table><tbody><tr><math><mi>foo</mi><mi>bar</mi></math></tr></tbody></table>
 #errors
 Line: 1 Col: 45 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 57 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 69 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 76 Unexpected end tag (math) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <!DOCTYPE html><body><table><tbody><tr><td><math><mi>foo</mi><mi>bar</mi></math></td></tr></tbody></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <math math>
 |               <math mi>
 |                 "foo"
 |               <math mi>
 |                 "bar"
 #data
 <!DOCTYPE html><body><table><tbody><tr><td><math><mi>foo</mi><mi>bar</mi></math><p>baz</td></tr></tbody></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <math math>
 |               <math mi>
 |                 "foo"
 |               <math mi>
 |                 "bar"
 |             <p>
 |               "baz"
 #data
 <!DOCTYPE html><body><table><caption><math><mi>foo</mi><mi>bar</mi></math><p>baz</caption></table>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <math math>
 |           <math mi>
 |             "foo"
 |           <math mi>
 |             "bar"
 |         <p>
 |           "baz"
 #data
 <!DOCTYPE html><body><table><caption><math><mi>foo</mi><mi>bar</mi><p>baz</table><p>quux
 #errors
 Line: 1 Col: 70 HTML start tag "p" in a foreign namespace context.
 Line: 1 Col: 81 Unexpected end table tag in caption. Generates implied end caption.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <math math>
 |           <math mi>
 |             "foo"
 |           <math mi>
 |             "bar"
 |         <p>
 |           "baz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><caption><math><mi>foo</mi><mi>bar</mi>baz</table><p>quux
 #errors
 Line: 1 Col: 78 Unexpected end table tag in caption. Generates implied end caption.
 Line: 1 Col: 78 Unexpected end tag (caption). Missing end tag (math).
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <caption>
 |         <math math>
 |           <math mi>
 |             "foo"
 |           <math mi>
 |             "bar"
 |           "baz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><colgroup><math><mi>foo</mi><mi>bar</mi><p>baz</table><p>quux
 #errors
 Line: 1 Col: 44 Unexpected start tag (math) in table context caused voodoo mode.
 Line: 1 Col: 56 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 68 Unexpected end tag (mi) in table context caused voodoo mode.
 Line: 1 Col: 71 HTML start tag "p" in a foreign namespace context.
 Line: 1 Col: 71 Unexpected start tag (p) in table context caused voodoo mode.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <p>
 |       "baz"
 |     <table>
 |       <colgroup>
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><tr><td><select><math><mi>foo</mi><mi>bar</mi><p>baz</table><p>quux
 #errors
 Line: 1 Col: 50 Unexpected start tag token (math) in the select phase. Ignored.
 Line: 1 Col: 54 Unexpected start tag token (mi) in the select phase. Ignored.
 Line: 1 Col: 62 Unexpected end tag (mi) in the select phase. Ignored.
 Line: 1 Col: 66 Unexpected start tag token (mi) in the select phase. Ignored.
 Line: 1 Col: 74 Unexpected end tag (mi) in the select phase. Ignored.
 Line: 1 Col: 77 Unexpected start tag token (p) in the select phase. Ignored.
 Line: 1 Col: 88 Unexpected table element end tag (tables) in the select in table phase.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             <select>
 |               "foobarbaz"
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body><table><select><math><mi>foo</mi><mi>bar</mi><p>baz</table><p>quux
 #errors
 Line: 1 Col: 36 Unexpected start tag (select) in table context caused voodoo mode.
 Line: 1 Col: 42 Unexpected start tag token (math) in the select phase. Ignored.
 Line: 1 Col: 46 Unexpected start tag token (mi) in the select phase. Ignored.
 Line: 1 Col: 54 Unexpected end tag (mi) in the select phase. Ignored.
 Line: 1 Col: 58 Unexpected start tag token (mi) in the select phase. Ignored.
 Line: 1 Col: 66 Unexpected end tag (mi) in the select phase. Ignored.
 Line: 1 Col: 69 Unexpected start tag token (p) in the select phase. Ignored.
 Line: 1 Col: 80 Unexpected table element end tag (tables) in the select in table phase.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       "foobarbaz"
 |     <table>
 |     <p>
 |       "quux"
 #data
 <!DOCTYPE html><body></body></html><math><mi>foo</mi><mi>bar</mi><p>baz
 #errors
 Line: 1 Col: 41 Unexpected start tag (math).
 Line: 1 Col: 68 HTML start tag "p" in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <p>
 |       "baz"
 #data
 <!DOCTYPE html><body></body><math><mi>foo</mi><mi>bar</mi><p>baz
 #errors
 Line: 1 Col: 34 Unexpected start tag token (math) in the after body phase.
 Line: 1 Col: 61 HTML start tag "p" in a foreign namespace context.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mi>
 |         "foo"
 |       <math mi>
 |         "bar"
 |     <p>
 |       "baz"
 #data
 <!DOCTYPE html><frameset><math><mi></mi><mi></mi><p><span>
 #errors
 Line: 1 Col: 31 Unexpected start tag token (math) in the frameset phase. Ignored.
 Line: 1 Col: 35 Unexpected start tag token (mi) in the frameset phase. Ignored.
 Line: 1 Col: 40 Unexpected end tag token (mi) in the frameset phase. Ignored.
 Line: 1 Col: 44 Unexpected start tag token (mi) in the frameset phase. Ignored.
 Line: 1 Col: 49 Unexpected end tag token (mi) in the frameset phase. Ignored.
 Line: 1 Col: 52 Unexpected start tag token (p) in the frameset phase. Ignored.
 Line: 1 Col: 58 Unexpected start tag token (span) in the frameset phase. Ignored.
 Line: 1 Col: 58 Expected closing tag. Unexpected end of file.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><frameset></frameset><math><mi></mi><mi></mi><p><span>
 #errors
 Line: 1 Col: 42 Unexpected start tag (math) in the after frameset phase. Ignored.
 Line: 1 Col: 46 Unexpected start tag (mi) in the after frameset phase. Ignored.
 Line: 1 Col: 51 Unexpected end tag (mi) in the after frameset phase. Ignored.
 Line: 1 Col: 55 Unexpected start tag (mi) in the after frameset phase. Ignored.
 Line: 1 Col: 60 Unexpected end tag (mi) in the after frameset phase. Ignored.
 Line: 1 Col: 63 Unexpected start tag (p) in the after frameset phase. Ignored.
 Line: 1 Col: 69 Unexpected start tag (span) in the after frameset phase. Ignored.
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!DOCTYPE html><body xlink:href=foo><math xlink:href=foo></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     <math math>
 |       xlink href="foo"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo></mi></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <math math>
 |       <math mi>
 |         xlink href="foo"
 |         xml lang="en"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo /></math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <math math>
 |       <math mi>
 |         xlink href="foo"
 |         xml lang="en"
 #data
 <!DOCTYPE html><body xlink:href=foo xml:lang=en><math><mi xml:lang=en xlink:href=foo />bar</math>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     xlink:href="foo"
 |     xml:lang="en"
 |     <math math>
 |       <math mi>
 |         xlink href="foo"
 |         xml lang="en"
 |       "bar"
--- a/lib/html5lib/tests/testdata/tree-construction/tests_innerHTML_1.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tests_innerHTML_1.dat
@ -1,741 +0,0 @@
 #data
 <body><span>
 #errors
 #document-fragment
 body
 #document
 | <span>
 #data
 <span><body>
 #errors
 #document-fragment
 body
 #document
 | <span>
 #data
 <span><body>
 #errors
 #document-fragment
 div
 #document
 | <span>
 #data
 <body><span>
 #errors
 #document-fragment
 html
 #document
 | <head>
 | <body>
 |   <span>
 #data
 <frameset><span>
 #errors
 #document-fragment
 body
 #document
 | <span>
 #data
 <span><frameset>
 #errors
 #document-fragment
 body
 #document
 | <span>
 #data
 <span><frameset>
 #errors
 #document-fragment
 div
 #document
 | <span>
 #data
 <frameset><span>
 #errors
 #document-fragment
 html
 #document
 | <head>
 | <frameset>
 #data
 <table><tr>
 #errors
 #document-fragment
 table
 #document
 | <tbody>
 |   <tr>
 #data
 </table><tr>
 #errors
 #document-fragment
 table
 #document
 | <tbody>
 |   <tr>
 #data
 <a>
 #errors
 #document-fragment
 table
 #document
 | <a>
 #data
 <a>
 #errors
 #document-fragment
 table
 #document
 | <a>
 #data
 <a><caption>a
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <caption>
 |   "a"
 #data
 <a><colgroup><col>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <colgroup>
 |   <col>
 #data
 <a><tbody><tr>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <tbody>
 |   <tr>
 #data
 <a><tfoot><tr>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <tfoot>
 |   <tr>
 #data
 <a><thead><tr>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <thead>
 |   <tr>
 #data
 <a><tr>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <tbody>
 |   <tr>
 #data
 <a><th>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <tbody>
 |   <tr>
 |     <th>
 #data
 <a><td>
 #errors
 #document-fragment
 table
 #document
 | <a>
 | <tbody>
 |   <tr>
 |     <td>
 #data
 <table></table><tbody>
 #errors
 #document-fragment
 caption
 #document
 | <table>
 #data
 </table><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 #data
 <span></table>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 #data
 </caption><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 #data
 <span></caption><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><caption><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><col><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><colgroup><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><html><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><tbody><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><td><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><tfoot><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><thead><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><th><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span><tr><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 <span></table><span>
 #errors
 #document-fragment
 caption
 #document
 | <span>
 |   <span>
 #data
 </colgroup><col>
 #errors
 #document-fragment
 colgroup
 #document
 | <col>
 #data
 <a><col>
 #errors
 #document-fragment
 colgroup
 #document
 | <col>
 #data
 <caption><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <col><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <colgroup><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <tbody><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <tfoot><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <thead><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 </table><a>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 #data
 <a><tr>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 | <tr>
 #data
 <a><td>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 | <tr>
 |   <td>
 #data
 <a><td>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 | <tr>
 |   <td>
 #data
 <a><td>
 #errors
 #document-fragment
 tbody
 #document
 | <a>
 | <tr>
 |   <td>
 #data
 <td><table><tbody><a><tr>
 #errors
 #document-fragment
 tbody
 #document
 | <tr>
 |   <td>
 |     <a>
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 </tr><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <td><table><a><tr></tr><tr>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 |   <a>
 |   <table>
 |     <tbody>
 |       <tr>
 |       <tr>
 #data
 <caption><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <col><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <colgroup><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <tbody><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <tfoot><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <thead><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <tr><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 </table><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 #data
 <td><table></table><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 |   <table>
 | <td>
 #data
 <td><table></table><td>
 #errors
 #document-fragment
 tr
 #document
 | <td>
 |   <table>
 | <td>
 #data
 <caption><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <col><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <colgroup><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <tbody><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <tfoot><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <th><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <thead><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <tr><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </table><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </tbody><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </td><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </tfoot><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </thead><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </th><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 </tr><a>
 #errors
 #document-fragment
 td
 #document
 | <a>
 #data
 <table><td><td>
 #errors
 #document-fragment
 td
 #document
 | <table>
 |   <tbody>
 |     <tr>
 |       <td>
 |       <td>
 #data
 </select><option>
 #errors
 #document-fragment
 select
 #document
 | <option>
 #data
 <input><option>
 #errors
 #document-fragment
 select
 #document
 | <option>
 #data
 <keygen><option>
 #errors
 #document-fragment
 select
 #document
 | <option>
 #data
 <textarea><option>
 #errors
 #document-fragment
 select
 #document
 | <option>
 #data
 </html><!--abc-->
 #errors
 #document-fragment
 html
 #document
 | <head>
 | <body>
 | <!-- abc -->
 #data
 </frameset><frame>
 #errors
 #document-fragment
 frameset
 #document
 | <frame>
 #data
 #errors
 #document-fragment
 html
 #document
 | <head>
 | <body>
--- a/lib/html5lib/tests/testdata/tree-construction/tricky01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/tricky01.dat
@ -1,261 +0,0 @@
 #data
 <b><p>Bold </b> Not bold</p>
 Also not bold.
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <b>
 |     <p>
 |       <b>
 |         "Bold "
 |       " Not bold"
 |     "
 Also not bold."
 #data
 <html>
 <font color=red><i>Italic and Red<p>Italic and Red </font> Just italic.</p> Italic only.</i> Plain
 <p>I should not be red. <font color=red>Red. <i>Italic and red.</p>
 <p>Italic and red. </i> Red.</font> I should not be red.</p>
 <b>Bold <i>Bold and italic</b> Only Italic </i> Plain
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <font>
 |       color="red"
 |       <i>
 |         "Italic and Red"
 |     <i>
 |       <p>
 |         <font>
 |           color="red"
 |           "Italic and Red "
 |         " Just italic."
 |       " Italic only."
 |     " Plain
 "
 |     <p>
 |       "I should not be red. "
 |       <font>
 |         color="red"
 |         "Red. "
 |         <i>
 |           "Italic and red."
 |     <font>
 |       color="red"
 |       <i>
 |         "
 "
 |     <p>
 |       <font>
 |         color="red"
 |         <i>
 |           "Italic and red. "
 |         " Red."
 |       " I should not be red."
 |     "
 "
 |     <b>
 |       "Bold "
 |       <i>
 |         "Bold and italic"
 |     <i>
 |       " Only Italic "
 |     " Plain"
 #data
 <html><body>
 <p><font size="7">First paragraph.</p>
 <p>Second paragraph.</p></font>
 <b><p><i>Bold and Italic</b> Italic</p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "
 "
 |     <p>
 |       <font>
 |         size="7"
 |         "First paragraph."
 |     <font>
 |       size="7"
 |       "
 "
 |       <p>
 |         "Second paragraph."
 |     "
 "
 |     <b>
 |     <p>
 |       <b>
 |         <i>
 |           "Bold and Italic"
 |       <i>
 |         " Italic"
 #data
 <html>
 <dl>
 <dt><b>Boo
 <dd>Goo?
 </dl>
 </html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <dl>
 |       "
 "
 |       <dt>
 |         <b>
 |           "Boo
 "
 |       <dd>
 |         <b>
 |           "Goo?
 "
 |     <b>
 |       "
 "
 #data
 <html><body>
 <label><a><div>Hello<div>World</div></a></label>  
 </body></html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "
 "
 |     <label>
 |       <a>
 |       <div>
 |         <a>
 |           "Hello"
 |           <div>
 |             "World"
 |         "  
 "
 #data
 <table><center> <font>a</center> <img> <tr><td> </td> </tr> </table>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <center>
 |       " "
 |       <font>
 |         "a"
 |     <font>
 |       <img>
 |       " "
 |     <table>
 |       " "
 |       <tbody>
 |         <tr>
 |           <td>
 |             " "
 |           " "
 |         " "
 #data
 <table><tr><p><a><p>You should see this text.
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       <a>
 |     <p>
 |       <a>
 |         "You should see this text."
 |     <table>
 |       <tbody>
 |         <tr>
 #data
 <TABLE>
 <TR>
 <CENTER><CENTER><TD></TD></TR><TR>
 <FONT>
 <TABLE><tr></tr></TABLE>
 </P>
 <a></font><font></a>
 This page contains an insanely badly-nested tag sequence.
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <center>
 |       <center>
 |     <font>
 |       "
 "
 |     <table>
 |       "
 "
 |       <tbody>
 |         <tr>
 |           "
 "
 |           <td>
 |         <tr>
 |           "
 "
 |     <table>
 |       <tbody>
 |         <tr>
 |     <font>
 |       "
 "
 |       <p>
 |       "
 "
 |       <a>
 |     <a>
 |       <font>
 |     <font>
 |       "
 This page contains an insanely badly-nested tag sequence."
 #data
 <html>
 <body>
 <b><nobr><div>This text is in a div inside a nobr</nobr>More text that should not be in the nobr, i.e., the
 nobr should have closed the div inside it implicitly. </b><pre>A pre tag outside everything else.</pre>
 </body>
 </html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "
 "
 |     <b>
 |       <nobr>
 |     <div>
 |       <b>
 |         <nobr>
 |           "This text is in a div inside a nobr"
 |         "More text that should not be in the nobr, i.e., the
 nobr should have closed the div inside it implicitly. "
 |       <pre>
 |         "A pre tag outside everything else."
 |       "
 "
--- a/lib/html5lib/tests/testdata/tree-construction/webkit01.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/webkit01.dat
@ -1,594 +0,0 @@
 #data
 Test
 #errors
 Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE.
 #document
 | <html>
 |   <head>
 |   <body>
 |     "Test"
 #data
 <div></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 #data
 <div>Test</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "Test"
 #data
 <di
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 #data
 <div>Hello</div>
 <script>
 console.log("PASS");
 </script>
 <div>Bye</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "Hello"
 |     "
 "
 |     <script>
 |       "
 console.log("PASS");
 "
 |     "
 "
 |     <div>
 |       "Bye"
 #data
 <div foo="bar">Hello</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       foo="bar"
 |       "Hello"
 #data
 <div>Hello</div>
 <script>
 console.log("FOO<span>BAR</span>BAZ");
 </script>
 <div>Bye</div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       "Hello"
 |     "
 "
 |     <script>
 |       "
 console.log("FOO<span>BAR</span>BAZ");
 "
 |     "
 "
 |     <div>
 |       "Bye"
 #data
 <foo bar="baz"></foo><potato quack="duck"></potato>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       bar="baz"
 |     <potato>
 |       quack="duck"
 #data
 <foo bar="baz"><potato quack="duck"></potato></foo>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       bar="baz"
 |       <potato>
 |         quack="duck"
 #data
 <foo></foo bar="baz"><potato></potato quack="duck">
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |     <potato>
 #data
 </ tttt>
 #errors
 #document
 | <!--  tttt -->
 | <html>
 |   <head>
 |   <body>
 #data
 <div FOO ><img><img></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       foo=""
 |       <img>
 |       <img>
 #data
 <p>Test</p<p>Test2</p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       "TestTest2"
 #data
 <rdar://problem/6869687>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <rdar:>
 |       6869687=""
 |       problem=""
 #data
 <A>test< /A>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |       "test< /A>"
 #data
 &lt;
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "<"
 #data
 <body foo='bar'><body foo='baz' yo='mama'>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     foo="bar"
 |     yo="mama"
 #data
 <body></br foo="bar"></body>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <br>
 #data
 <bdy><br foo="bar"></body>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <bdy>
 |       <br>
 |         foo="bar"
 #data
 <body></body></br foo="bar">
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <br>
 #data
 <bdy></body><br foo="bar">
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <bdy>
 |       <br>
 |         foo="bar"
 #data
 <html><body></body></html><!-- Hi there -->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 | <!--  Hi there  -->
 #data
 <html><body></body></html>x<!-- Hi there -->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "x"
 |     <!--  Hi there  -->
 #data
 <html><body></body></html>x<!-- Hi there --></html><!-- Again -->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "x"
 |     <!--  Hi there  -->
 | <!--  Again  -->
 #data
 <html><body></body></html>x<!-- Hi there --></body></html><!-- Again -->
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "x"
 |     <!--  Hi there  -->
 | <!--  Again  -->
 #data
 <html><body><ruby><div><rp>xx</rp></div></ruby></body></html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <ruby>
 |       <div>
 |         <rp>
 |           "xx"
 #data
 <html><body><ruby><div><rt>xx</rt></div></ruby></body></html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <ruby>
 |       <div>
 |         <rt>
 |           "xx"
 #data
 <html><frameset><!--1--><noframes>A</noframes><!--2--></frameset><!--3--><noframes>B</noframes><!--4--></html><!--5--><noframes>C</noframes><!--6-->
 #errors
 #document
 | <html>
 |   <head>
 |   <frameset>
 |     <!-- 1 -->
 |     <noframes>
 |       "A"
 |     <!-- 2 -->
 |   <!-- 3 -->
 |   <noframes>
 |     "B"
 |   <!-- 4 -->
 |   <noframes>
 |     "C"
 | <!-- 5 -->
 | <!-- 6 -->
 #data
 <select><option>A<select><option>B<select><option>C<select><option>D<select><option>E<select><option>F<select><option>G<select>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <select>
 |       <option>
 |         "A"
 |     <option>
 |       "B"
 |       <select>
 |         <option>
 |           "C"
 |     <option>
 |       "D"
 |       <select>
 |         <option>
 |           "E"
 |     <option>
 |       "F"
 |       <select>
 |         <option>
 |           "G"
 #data
 <dd><dd><dt><dt><dd><li><li>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <dd>
 |     <dd>
 |     <dt>
 |     <dt>
 |     <dd>
 |       <li>
 |       <li>
 #data
 <div><b></div><div><nobr>a<nobr>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <b>
 |     <div>
 |       <b>
 |         <nobr>
 |           "a"
 |         <nobr>
 #data
 <head></head>
 <body></body>
 #errors
 #document
 | <html>
 |   <head>
 |   "
 "
 |   <body>
 #data
 <head></head> <style></style>ddd
 #errors
 #document
 | <html>
 |   <head>
 |     <style>
 |   " "
 |   <body>
 |     "ddd"
 #data
 <kbd><table></kbd><col><select><tr>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <kbd>
 |       <select>
 |       <table>
 |         <colgroup>
 |           <col>
 |         <tbody>
 |           <tr>
 #data
 <kbd><table></kbd><col><select><tr></table><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <kbd>
 |       <select>
 |       <table>
 |         <colgroup>
 |           <col>
 |         <tbody>
 |           <tr>
 |       <div>
 #data
 <a><li><style></style><title></title></a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |     <li>
 |       <a>
 |         <style>
 |         <title>
 #data
 <font></p><p><meta><title></title></font>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <font>
 |       <p>
 |     <p>
 |       <font>
 |         <meta>
 |         <title>
 #data
 <a><center><title></title><a>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <a>
 |     <center>
 |       <a>
 |         <title>
 |       <a>
 #data
 <svg><title><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg title>
 |         <div>
 #data
 <svg><title><rect><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg title>
 |         <rect>
 |           <div>
 #data
 <svg><title><svg><div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg title>
 |         <svg svg>
 |         <div>
 #data
 <img <="" FAIL>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <img>
 |       <=""
 |       fail=""
 #data
 <ul><li><div id='foo'/>A</li><li>B<div>C</div></li></ul>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <ul>
 |       <li>
 |         <div>
 |           id="foo"
 |           "A"
 |       <li>
 |         "B"
 |         <div>
 |           "C"
 #data
 <svg><em><desc></em>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |     <em>
 |       <desc>
 #data
 <svg><tfoot></mi><td>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <svg svg>
 |       <svg tfoot>
 |         <svg td>
 #data
 <math><mrow><mrow><mn>1</mn></mrow><mi>a</mi></mrow></math>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <math math>
 |       <math mrow>
 |         <math mrow>
 |           <math mn>
 |             "1"
 |         <math mi>
 |           "a"
 #data
 <!doctype html><input type="hidden"><frameset>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <frameset>
 #data
 <!doctype html><input type="button"><frameset>
 #errors
 #document
 | <!DOCTYPE html>
 | <html>
 |   <head>
 |   <body>
 |     <input>
 |       type="button"
--- a/lib/html5lib/tests/testdata/tree-construction/webkit02.dat
+++ b/lib/html5lib/tests/testdata/tree-construction/webkit02.dat
@ -1,94 +0,0 @@
 #data
 <foo bar=qux/>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <foo>
 |       bar="qux/"
 #data
 <p id="status"><noscript><strong>A</strong></noscript><span>B</span></p>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <p>
 |       id="status"
 |       <noscript>
 |         "<strong>A</strong>"
 |       <span>
 |         "B"
 #data
 <div><sarcasm><div></div></sarcasm></div>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <div>
 |       <sarcasm>
 |         <div>
 #data
 <html><body><img src="" border="0" alt="><div>A</div></body></html>
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 #data
 <table><td></tbody>A
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     "A"
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 #data
 <table><td></thead>A
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "A"
 #data
 <table><td></tfoot>A
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <tbody>
 |         <tr>
 |           <td>
 |             "A"
 #data
 <table><thead><td></tbody>A
 #errors
 #document
 | <html>
 |   <head>
 |   <body>
 |     <table>
 |       <thead>
 |         <tr>
 |           <td>
 |             "A"
--- a/Show More
+++ b/Show More