import os
import unittest
from support import html5lib_test_files
try:
import json
except ImportError:
import simplejson as json
import html5lib
from html5lib import html5parser, serializer, constants
from html5lib.treewalkers._base import TreeWalker
optionals_loaded = []
try:
from lxml import etree
optionals_loaded.append("lxml")
except ImportError:
pass
default_namespace = constants.namespaces["html"]
class JsonWalker(TreeWalker):
def __iter__(self):
for token in self.tree:
type = token[0]
if type == "StartTag":
if len(token) == 4:
namespace, name, attrib = token[1:4]
else:
namespace = default_namespace
name, attrib = token[1:3]
yield self.startTag(namespace, name, self._convertAttrib(attrib))
elif type == "EndTag":
if len(token) == 3:
namespace, name = token[1:3]
else:
namespace = default_namespace
name = token[1]
yield self.endTag(namespace, name)
elif type == "EmptyTag":
if len(token) == 4:
namespace, name, attrib = token[1:]
else:
namespace = default_namespace
name, attrib = token[1:]
for token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
yield token
elif type == "Comment":
yield self.comment(token[1])
elif type in ("Characters", "SpaceCharacters"):
for token in self.text(token[1]):
yield token
elif type == "Doctype":
if len(token) == 4:
yield self.doctype(token[1], token[2], token[3])
elif len(token) == 3:
yield self.doctype(token[1], token[2])
else:
yield self.doctype(token[1])
else:
raise ValueError("Unknown token type: " + type)
def _convertAttrib(self, attribs):
"""html5lib tree-walkers use a dict of (namespace, name): value for
attributes, but JSON cannot represent this. Convert from the format
in the serializer tests (a list of dicts with "namespace", "name",
and "value" as keys) to html5lib's tree-walker format."""
attrs = {}
for attrib in attribs:
name = (attrib["namespace"], attrib["name"])
assert(name not in attrs)
attrs[name] = attrib["value"]
return attrs
def serialize_html(input, options):
options = dict([(str(k),v) for k,v in options.iteritems()])
return serializer.HTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
def serialize_xhtml(input, options):
options = dict([(str(k),v) for k,v in options.iteritems()])
return serializer.XHTMLSerializer(**options).render(JsonWalker(input),options.get("encoding",None))
def make_test(input, expected, xhtml, options):
result = serialize_html(input, options)
if len(expected) == 1:
assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:False\n%s"%(expected[0], result, str(options))
elif result not in expected:
assert False, "Expected: %s, Received: %s" % (expected, result)
if not xhtml:
return
result = serialize_xhtml(input, options)
if len(xhtml) == 1:
assert xhtml[0] == result, "Expected:\n%s\nActual:\n%s\nOptions\nxhtml:True\n%s"%(xhtml[0], result, str(options))
elif result not in xhtml:
assert False, "Expected: %s, Received: %s" % (xhtml, result)
class EncodingTestCase(unittest.TestCase):
def throwsWithLatin1(self, input):
self.assertRaises(UnicodeEncodeError, serialize_html, input, {"encoding": "iso-8859-1"})
def testDoctypeName(self):
self.throwsWithLatin1([["Doctype", u"\u0101"]])
def testDoctypePublicId(self):
self.throwsWithLatin1([["Doctype", u"potato", u"\u0101"]])
def testDoctypeSystemId(self):
self.throwsWithLatin1([["Doctype", u"potato", u"potato", u"\u0101"]])
def testCdataCharacters(self):
self.assertEquals("