SickRage/lib/html5lib/serializer/htmlserializer.py

# Probe for the built-in frozenset: evaluating the bare name raises NameError
# on interpreters that do not provide it, in which case a substitute is bound.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset

import gettext
# Short alias wrapped around the serializer's error-message literals below.
_ = gettext.gettext

from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements, entities, xmlEntities
from html5lib import utils
from xml.sax.saxutils import escape

# Join the imported whitespace characters into a single unicode string; the
# serializer later concatenates it with other characters and scans the result
# one character at a time when deciding whether an attribute needs quoting.
spaceCharacters = u"".join(spaceCharacters)

# Install a codec error handler named "htmlentityreplace" that substitutes
# unencodable characters with HTML character references.  When the codecs
# module does not offer pluggable error handlers, encoding falls back to
# "strict".
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from html5lib.constants import entities

    # Maps a code point (int) to the entity name used when encoding it.
    encode_entity_map = {}
    # True when a single non-BMP character has length 1 on this interpreter;
    # otherwise such a character is stored as a two-unit surrogate pair.
    is_ucs4 = len(u"\U0010FFFF") == 1
    for k, v in entities.items():
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
            continue
        if v != "&":
            if len(v) == 2:
                v = utils.surrogatePairToCodepoint(v)
            else:
                # Any failure here propagates unchanged; no debug output is
                # written to stdout at import time.
                v = ord(v)
            if v not in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler that emits HTML character references.

        For UnicodeEncodeError / UnicodeTranslateError, every character in
        exc.object[exc.start:exc.end] is replaced by "&name;" when its code
        point is present in encode_entity_map, or by a hexadecimal numeric
        reference ("&#x...;") otherwise.  A surrogate pair (as detected by
        utils.isSurrogatePair) is combined into one code point first.

        Returns a (replacement, resume_position) tuple as expected by the
        codecs error-handler protocol.  Any other exception type is delegated
        to codecs.xmlcharrefreplace_errors.
        """
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            codepoints = []
            skip = False
            for i, c in enumerate(exc.object[exc.start:exc.end]):
                if skip:
                    # Second half of a surrogate pair consumed on the
                    # previous iteration.
                    skip = False
                    continue
                index = i + exc.start
                # Never look past exc.end when probing for a surrogate pair.
                if utils.isSurrogatePair(exc.object[index:min(exc.end, index + 2)]):
                    codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                    skip = True
                else:
                    codepoint = ord(c)
                codepoints.append(codepoint)
            for cp in codepoints:
                e = encode_entity_map.get(cp)
                if e:
                    res.append("&")
                    res.append(e)
                    # Entity names may be stored with or without the
                    # trailing semicolon; always terminate the reference.
                    if not e.endswith(";"):
                        res.append(";")
                else:
                    res.append("&#x%x;" % cp)
            return (u"".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    # Registration is done; drop the helper name from the module namespace.
    del register_error


class HTMLSerializer(object):
    """Serialize a stream of treewalker tokens into HTML text.

    Tokens are dicts with a "type" key ("Doctype", "Characters",
    "SpaceCharacters", "StartTag", "EmptyTag", "EndTag", "Comment" or
    "Entity").  Output formatting is controlled by the option attributes
    below, each of which can be overridden per instance through keyword
    arguments to the constructor.  Problems found while serializing are
    appended to ``self.errors``; when ``self.strict`` is true they are
    raised instead.
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = u'"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names of the attributes that may be overridden via constructor kwargs.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
          "minimize_boolean_attributes", "use_trailing_solidus",
          "space_before_trailing_solidus", "omit_optional_tags",
          "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
          "escape_rcdata", "resolve_entities", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of the
          document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote_char turns off automatic quote selection, unless
        # use_best_quote_char is itself passed (the loop below applies it).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False
        # Set here so encode()/encodeStrict() are usable before serialize()
        # has been called; serialize() overwrites it on every call.
        self.encoding = None

    def encode(self, string):
        """Encode ``string`` to ``self.encoding`` using the module's
        ``unicode_encode_errors`` handler; return it unchanged when no
        encoding is set."""
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string

    def encodeStrict(self, string):
        """Encode ``string`` to ``self.encoding`` with the "strict" error
        handler; return it unchanged when no encoding is set."""
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized output for ``treewalker`` piece by piece.

        ``treewalker`` is an iterable of token dicts.  When ``encoding`` is
        given, each yielded piece is encoded to it; otherwise pieces are
        yielded as text.  ``self.errors`` is reset at the start of every call
        and collects the messages passed to serializeError().
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = u"<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += u' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += u" SYSTEM"
                if token["systemId"]:
                    # Pick whichever quote character the identifier does not
                    # contain; if it contains both, report an error.
                    if u'"' in token["systemId"]:
                        if u"'" in token["systemId"]:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += u">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are emitted unescaped.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict(u"<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in sorted(token["data"].items()):
                    #TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(u' ')

                    yield self.encodeStrict(k)
                    # Boolean attributes are written as the bare name when
                    # minimization is enabled; everything else gets "=value".
                    if (not self.minimize_boolean_attributes or
                            (k not in booleanAttributes.get(name, tuple()) and
                             k not in booleanAttributes.get("", tuple()))):
                        yield self.encodeStrict(u"=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quote only when the value contains a character
                            # that would otherwise end or confuse it.
                            quote_attr = any(
                                c in v for c in spaceCharacters + u">\"'=")
                        v = v.replace(u"&", u"&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace(u"<", u"&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Prefer the quote character that does not
                                # occur in the value, to avoid escaping.
                                if u"'" in v and u'"' not in v:
                                    quote_char = u'"'
                                elif u'"' in v and u"'" not in v:
                                    quote_char = u"'"
                            if quote_char == u"'":
                                v = v.replace(u"'", u"&#39;")
                            else:
                                v = v.replace(u'"', u"&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(u" /")
                    else:
                        yield self.encodeStrict(u"/")
                yield self.encode(u">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict(u"</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict(u"<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Translate the template first, then interpolate, so the
                    # message catalogue lookup uses a constant msgid.
                    self.serializeError(_("Entity %s not recognized") % name)
                # An unrecognised entity cannot be resolved; in non-strict
                # mode it is written back out as a reference instead of
                # failing with KeyError on the entities lookup.
                if self.resolve_entities and known and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize ``treewalker`` and return the output as one string
        (encoded when ``encoding`` is given, text otherwise)."""
        if encoding:
            return "".join(self.serialize(treewalker, encoding))
        else:
            return u"".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record ``data`` in ``self.errors``; raise SerializeError when
        ``self.strict`` is true."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError

# Must be a class (not a function) so that ``raise SerializeError`` raises an
# exception callers can catch, rather than failing with TypeError.
class SerializeError(Exception):
    """Error in serialized tree"""
    pass
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather its series data, allowing you to access and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also, this edition is based off the code we used in our XEM edition, so it comes with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files such as cache.db and failed.db if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy! 2014-03-10 01:18:05 -04:00			`try:`
			`frozenset`
			`except NameError:`
			`# Import from the sets module for python 2.3`
			`from sets import ImmutableSet as frozenset`

			`import gettext`
			`_ = gettext.gettext`

			`from html5lib.constants import voidElements, booleanAttributes, spaceCharacters`
			`from html5lib.constants import rcdataElements, entities, xmlEntities`
			`from html5lib import utils`
			`from xml.sax.saxutils import escape`

			`spaceCharacters = u"".join(spaceCharacters)`

			`try:`
			`from codecs import register_error, xmlcharrefreplace_errors`
			`except ImportError:`
			`unicode_encode_errors = "strict"`
			`else:`
			`unicode_encode_errors = "htmlentityreplace"`

			`from html5lib.constants import entities`

			`encode_entity_map = {}`
			`is_ucs4 = len(u"\U0010FFFF") == 1`
			`for k, v in entities.items():`
			`#skip multi-character entities`
			`if ((is_ucs4 and len(v) > 1) or`
			`(not is_ucs4 and len(v) > 2)):`
			`continue`
			`if v != "&":`
			`if len(v) == 2:`
			`v = utils.surrogatePairToCodepoint(v)`
			`else:`
			`try:`
			`v = ord(v)`
			`except:`
			`print v`
			`raise`
			`if not v in encode_entity_map or k.islower():`
			`# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.`
			`encode_entity_map[v] = k`

			`def htmlentityreplace_errors(exc):`
			`if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):`
			`res = []`
			`codepoints = []`
			`skip = False`
			`for i, c in enumerate(exc.object[exc.start:exc.end]):`
			`if skip:`
			`skip = False`
			`continue`
			`index = i + exc.start`
			`if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):`
			`codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])`
			`skip = True`
			`else:`
			`codepoint = ord(c)`
			`codepoints.append(codepoint)`
			`for cp in codepoints:`
			`e = encode_entity_map.get(cp)`
			`if e:`
			`res.append("&")`
			`res.append(e)`
			`if not e.endswith(";"):`
			`res.append(";")`
			`else:`
			`res.append("&#x%s;"%(hex(cp)[2:]))`
			`return (u"".join(res), exc.end)`
			`else:`
			`return xmlcharrefreplace_errors(exc)`

			`register_error(unicode_encode_errors, htmlentityreplace_errors)`

			`del register_error`


			`class HTMLSerializer(object):`

			`# attribute quoting options`
			`quote_attr_values = False`
			`quote_char = u'"'`
			`use_best_quote_char = True`

			`# tag syntax options`
			`omit_optional_tags = True`
			`minimize_boolean_attributes = True`
			`use_trailing_solidus = False`
			`space_before_trailing_solidus = True`

			`# escaping options`
			`escape_lt_in_attrs = False`
			`escape_rcdata = False`
			`resolve_entities = True`

			`# miscellaneous options`
			`inject_meta_charset = True`
			`strip_whitespace = False`
			`sanitize = False`

			`options = ("quote_attr_values", "quote_char", "use_best_quote_char",`
			`"minimize_boolean_attributes", "use_trailing_solidus",`
			`"space_before_trailing_solidus", "omit_optional_tags",`
			`"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",`
			`"escape_rcdata", "resolve_entities", "sanitize")`

			`def __init__(self, **kwargs):`
			`"""Initialize HTMLSerializer.`

			`Keyword options (default given first unless specified) include:`

			`inject_meta_charset=True\|False`
			`Whether it insert a meta element to define the character set of the`
			`document.`
			`quote_attr_values=True\|False`
			`Whether to quote attribute values that don't require quoting`
			`per HTML5 parsing rules.`
			`quote_char=u'"'\|u"'"`
			`Use given quote character for attribute quoting. Default is to`
			`use double quote unless attribute value contains a double quote,`
			`in which case single quotes are used instead.`
			`escape_lt_in_attrs=False\|True`
			`Whether to escape < in attribute values.`
			`escape_rcdata=False\|True`
			`Whether to escape characters that need to be escaped within normal`
			`elements within rcdata elements such as style.`
			`resolve_entities=True\|False`
			`Whether to resolve named character entities that appear in the`
			`source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;`
			`are unaffected by this setting.`
			`strip_whitespace=False\|True`
			`Whether to remove semantically meaningless whitespace. (This`
			`compresses all whitespace to a single space except within pre.)`
			`minimize_boolean_attributes=True\|False`
			`Shortens boolean attributes to give just the attribute value,`
			`for example <input disabled="disabled"> becomes <input disabled>.`
			`use_trailing_solidus=False\|True`
			`Includes a close-tag slash at the end of the start tag of void`
			`elements (empty elements whose end tag is forbidden). E.g. <hr/>.`
			`space_before_trailing_solidus=True\|False`
			`Places a space immediately before the closing slash in a tag`
			`using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.`
			`sanitize=False\|True`
			`Strip all unsafe or unknown constructs from output.`
			See `html5lib user documentation`_
			`omit_optional_tags=True\|False`
			`Omit start/end tags that are optional.`

			`.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation`
			`"""`
			`if kwargs.has_key('quote_char'):`
			`self.use_best_quote_char = False`
			`for attr in self.options:`
			`setattr(self, attr, kwargs.get(attr, getattr(self, attr)))`
			`self.errors = []`
			`self.strict = False`

			`def encode(self, string):`
			`assert(isinstance(string, unicode))`
			`if self.encoding:`
			`return string.encode(self.encoding, unicode_encode_errors)`
			`else:`
			`return string`

			`def encodeStrict(self, string):`
			`assert(isinstance(string, unicode))`
			`if self.encoding:`
			`return string.encode(self.encoding, "strict")`
			`else:`
			`return string`

			`def serialize(self, treewalker, encoding=None):`
			`self.encoding = encoding`
			`in_cdata = False`
			`self.errors = []`
			`if encoding and self.inject_meta_charset:`
			`from html5lib.filters.inject_meta_charset import Filter`
			`treewalker = Filter(treewalker, encoding)`
			`# XXX: WhitespaceFilter should be used before OptionalTagFilter`
			`# for maximum efficiently of this latter filter`
			`if self.strip_whitespace:`
			`from html5lib.filters.whitespace import Filter`
			`treewalker = Filter(treewalker)`
			`if self.sanitize:`
			`from html5lib.filters.sanitizer import Filter`
			`treewalker = Filter(treewalker)`
			`if self.omit_optional_tags:`
			`from html5lib.filters.optionaltags import Filter`
			`treewalker = Filter(treewalker)`
			`for token in treewalker:`
			`type = token["type"]`
			`if type == "Doctype":`
			`doctype = u"<!DOCTYPE %s" % token["name"]`

			`if token["publicId"]:`
			`doctype += u' PUBLIC "%s"' % token["publicId"]`
			`elif token["systemId"]:`
			`doctype += u" SYSTEM"`
			`if token["systemId"]:`
			`if token["systemId"].find(u'"') >= 0:`
			`if token["systemId"].find(u"'") >= 0:`
			`self.serializeError(_("System identifer contains both single and double quote characters"))`
			`quote_char = u"'"`
			`else:`
			`quote_char = u'"'`
			`doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)`

			`doctype += u">"`
			`yield self.encodeStrict(doctype)`

			`elif type in ("Characters", "SpaceCharacters"):`
			`if type == "SpaceCharacters" or in_cdata:`
			`if in_cdata and token["data"].find("</") >= 0:`
			`self.serializeError(_("Unexpected </ in CDATA"))`
			`yield self.encode(token["data"])`
			`else:`
			`yield self.encode(escape(token["data"]))`

			`elif type in ("StartTag", "EmptyTag"):`
			`name = token["name"]`
			`yield self.encodeStrict(u"<%s" % name)`
			`if name in rcdataElements and not self.escape_rcdata:`
			`in_cdata = True`
			`elif in_cdata:`
			`self.serializeError(_("Unexpected child element of a CDATA element"))`
			`attributes = []`
			`for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):`
			`#TODO: Add namespace support here`
			`k = attr_name`
			`v = attr_value`
			`yield self.encodeStrict(u' ')`

			`yield self.encodeStrict(k)`
			`if not self.minimize_boolean_attributes or \`
			`(k not in booleanAttributes.get(name, tuple()) \`
			`and k not in booleanAttributes.get("", tuple())):`
			`yield self.encodeStrict(u"=")`
			`if self.quote_attr_values or not v:`
			`quote_attr = True`
			`else:`
			`quote_attr = reduce(lambda x,y: x or (y in v),`
			`spaceCharacters + u">\"'=", False)`
			`v = v.replace(u"&", u"&amp;")`
			`if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")`
			`if quote_attr:`
			`quote_char = self.quote_char`
			`if self.use_best_quote_char:`
			`if u"'" in v and u'"' not in v:`
			`quote_char = u'"'`
			`elif u'"' in v and u"'" not in v:`
			`quote_char = u"'"`
			`if quote_char == u"'":`
			`v = v.replace(u"'", u"&#39;")`
			`else:`
			`v = v.replace(u'"', u"&quot;")`
			`yield self.encodeStrict(quote_char)`
			`yield self.encode(v)`
			`yield self.encodeStrict(quote_char)`
			`else:`
			`yield self.encode(v)`
			`if name in voidElements and self.use_trailing_solidus:`
			`if self.space_before_trailing_solidus:`
			`yield self.encodeStrict(u" /")`
			`else:`
			`yield self.encodeStrict(u"/")`
			`yield self.encode(u">")`

			`elif type == "EndTag":`
			`name = token["name"]`
			`if name in rcdataElements:`
			`in_cdata = False`
			`elif in_cdata:`
			`self.serializeError(_("Unexpected child element of a CDATA element"))`
			`yield self.encodeStrict(u"</%s>" % name)`

			`elif type == "Comment":`
			`data = token["data"]`
			`if data.find("--") >= 0:`
			`self.serializeError(_("Comment contains --"))`
			`yield self.encodeStrict(u"<!--%s-->" % token["data"])`

			`elif type == "Entity":`
			`name = token["name"]`
			`key = name + ";"`
			`if not key in entities:`
			`self.serializeError(_("Entity %s not recognized" % name))`
			`if self.resolve_entities and key not in xmlEntities:`
			`data = entities[key]`
			`else:`
			`data = u"&%s;" % name`
			`yield self.encodeStrict(data)`

			`else:`
			`self.serializeError(token["data"])`

			`def render(self, treewalker, encoding=None):`
			`if encoding:`
			`return "".join(list(self.serialize(treewalker, encoding)))`
			`else:`
			`return u"".join(list(self.serialize(treewalker)))`

			`def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):`
			`# XXX The idea is to make data mandatory.`
			`self.errors.append(data)`
			`if self.strict:`
			`raise SerializeError`

			`def SerializeError(Exception):`
			`"""Error in serialized tree"""`
			`pass`