mirror of
https://github.com/moparisthebest/SickRage
synced 2024-11-10 19:35:08 -05:00
0d9fbc1ad7
This version of SickBeard uses both TVDB and TVRage to search and gather its series data, allowing you to access and download shows that you couldn't before because you were locked into only what TheTVDB had to offer. This edition is also based on the code we used in our XEM edition, so it comes with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files such as cache.db and failed.db if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy!
2734 lines
113 KiB
Python
2734 lines
113 KiB
Python
# Compatibility shim: if the ``frozenset`` builtin is missing, bind ``set``
# and ``frozenset`` to the equivalent classes from the ``sets`` module.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
|
|
|
|
# Compatibility shim: only define ``any`` when the builtin is missing.
try:
    any
except NameError:
    # Implement 'any' for python 2.4 and previous
    def any(iterable):
        """Return True if at least one element of ``iterable`` is truthy."""
        for element in iterable:
            if element:
                return True
        return False
|
|
|
|
# Pick an implementation of ``startswithany`` depending on whether
# ``str.startswith`` accepts a tuple of prefixes on this interpreter.
try:
    "abc".startswith(("a", "b"))
except TypeError:
    # Python 2.4 doesn't accept a tuple as argument to string startswith
    def startswithany(str, prefixes):
        """Return True if ``str`` starts with any of ``prefixes``."""
        for prefix in prefixes:
            if str.startswith(prefix):
                return True
        return False
else:
    def startswithany(str, prefixes):
        """Return True if ``str`` starts with any of ``prefixes``."""
        return str.startswith(prefixes)
|
|
|
|
import sys
|
|
import types
|
|
|
|
import inputstream
|
|
import tokenizer
|
|
|
|
import treebuilders
|
|
from treebuilders._base import Marker
|
|
from treebuilders import simpletree
|
|
|
|
import utils
|
|
import constants
|
|
from constants import spaceCharacters, asciiUpper2Lower
|
|
from constants import formattingElements, specialElements
|
|
from constants import headingElements, tableInsertModeElements
|
|
from constants import cdataElements, rcdataElements, voidElements
|
|
from constants import tokenTypes, ReparseException, namespaces, spaceCharacters
|
|
from constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
|
|
|
|
def parse(doc, treebuilder="simpletree", encoding=None,
          namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree

    doc - the markup to parse
    treebuilder - name handed to treebuilders.getTreeBuilder
    encoding - forwarded to HTMLParser.parse
    namespaceHTMLElements - forwarded to the HTMLParser constructor
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, encoding=encoding)
|
|
|
|
def parseFragment(doc, container="div", treebuilder="simpletree", encoding=None,
                  namespaceHTMLElements=True):
    """Parse a string or file-like object into a tree fragment

    doc - the markup to parse
    container - name of the context element, forwarded to
      HTMLParser.parseFragment
    treebuilder - name handed to treebuilders.getTreeBuilder
    encoding - forwarded to HTMLParser.parseFragment
    namespaceHTMLElements - forwarded to the HTMLParser constructor
    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, encoding=encoding)
|
|
|
|
def method_decorator_metaclass(function):
    """Return a metaclass that wraps every plain function of a class.

    function - a decorator; it is called with each attribute of the class
      body that is a plain Python function, and its return value replaces
      that attribute. All other attributes are left untouched.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Work on a snapshot of the items so rebinding entries can never
            # interfere with the iteration itself.
            for attributeName, attribute in list(classDict.items()):
                if isinstance(attribute, types.FunctionType):
                    classDict[attributeName] = function(attribute)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
|
|
|
|
class HTMLParser(object):
|
|
"""HTML parser. Generates a tree structure from a stream of (possibly
|
|
malformed) HTML"""
|
|
|
|
def __init__(self, tree = simpletree.TreeBuilder,
             tokenizer = tokenizer.HTMLTokenizer, strict = False,
             namespaceHTMLElements = True, debug=False):
    """
    strict - raise an exception when a parse error is encountered

    tree - a treebuilder class controlling the type of tree that will be
    returned. Built in treebuilders can be accessed through
    html5lib.treebuilders.getTreeBuilder(treeType)

    tokenizer - a class that provides a stream of tokens to the treebuilder.
    This may be replaced for e.g. a sanitizer which converts some tags to
    text

    namespaceHTMLElements - passed as the only argument when instantiating
    the treebuilder class

    debug - passed to getPhases, which uses it to decide whether the phase
    classes are built with the logging metaclass
    """
    # Raise an exception on the first error encountered
    self.strict = strict

    self.tree = tree(namespaceHTMLElements)
    self.tokenizer_class = tokenizer
    self.errors = []

    # One handler instance per phase name, all sharing this parser and tree.
    phases = {}
    for name, cls in getPhases(debug).items():
        phases[name] = cls(self, self.tree)
    self.phases = phases
|
|
|
|
def _parse(self, stream, innerHTML=False, container="div",
           encoding=None, parseMeta=True, useChardet=True, **kwargs):
    """Tokenize ``stream`` and run the main loop until it completes.

    stream - input handed to the tokenizer class
    innerHTML - stored as self.innerHTMLMode; selects fragment parsing in
      reset()
    container - stored as self.container; the fragment context element name
    encoding, parseMeta, useChardet, **kwargs - forwarded to the tokenizer
      class

    Whenever the main loop raises ReparseException the parser state is
    reset and the main loop is started again.
    """
    self.innerHTMLMode = innerHTML
    self.container = container
    self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
                                          parseMeta=parseMeta,
                                          useChardet=useChardet,
                                          parser=self, **kwargs)
    self.reset()

    while True:
        try:
            self.mainLoop()
            break
        except ReparseException:
            # The exception object itself is not needed; start over.
            self.reset()
|
|
|
|
def reset(self):
    """Put the parser (and its tree) back into the start-of-parse state."""
    self.tree.reset()
    self.firstStartTag = False
    self.errors = []
    self.log = []  # only used with debug mode
    # "quirks" / "limited quirks" / "no quirks"
    self.compatMode = "no quirks"

    if not self.innerHTMLMode:
        self.innerHTML = False
        self.phase = self.phases["initial"]
    else:
        self.innerHTML = self.container.lower()
        context = self.innerHTML
        tok = self.tokenizer

        # NOTE(review): cdataElements selects rcdataState and rcdataElements
        # selects rawtextState; confirm this pairing against constants.
        if context in cdataElements:
            tok.state = tok.rcdataState
        elif context in rcdataElements:
            tok.state = tok.rawtextState
        elif context == 'plaintext':
            tok.state = tok.plaintextState
        # otherwise: state already is data state, nothing to change

        self.phase = self.phases["beforeHtml"]
        self.phase.insertHtmlElement()
        self.resetInsertionMode()

    self.lastPhase = None

    self.beforeRCDataPhase = None

    self.framesetOK = True
|
|
|
|
def isHTMLIntegrationPoint(self, element):
    """Tell whether ``element`` is an HTML integration point.

    A MathML annotation-xml element qualifies only when it has an
    "encoding" attribute whose ASCII-lowercased value is "text/html" or
    "application/xhtml+xml"; any other element qualifies when its
    (namespace, name) pair is in htmlIntegrationPointElements.
    """
    isAnnotationXml = (element.name == "annotation-xml" and
                       element.namespace == namespaces["mathml"])
    if not isAnnotationXml:
        return (element.namespace, element.name) in htmlIntegrationPointElements

    if "encoding" not in element.attributes:
        return False
    encoding = element.attributes["encoding"].translate(asciiUpper2Lower)
    return encoding in ("text/html", "application/xhtml+xml")
|
|
|
|
def isMathMLTextIntegrationPoint(self, element):
    """Tell whether the (namespace, name) pair of ``element`` is listed in
    mathmlTextIntegrationPointElements."""
    key = (element.namespace, element.name)
    return key in mathmlTextIntegrationPointElements
|
|
|
|
def mainLoop(self):
    """Feed every token from the tokenizer to the appropriate phase.

    Each token is dispatched either to the current phase or to the
    "inForeignContent" phase, depending on the current node. A phase method
    may return a token, in which case that token is processed again (the
    phase may have changed in the meantime). Once the token stream is
    exhausted, processEOF() is called on the current phase for as long as
    it asks for reprocessing.
    """
    CharactersToken = tokenTypes["Characters"]
    SpaceCharactersToken = tokenTypes["SpaceCharacters"]
    StartTagToken = tokenTypes["StartTag"]
    EndTagToken = tokenTypes["EndTag"]
    CommentToken = tokenTypes["Comment"]
    DoctypeToken = tokenTypes["Doctype"]
    ParseErrorToken = tokenTypes["ParseError"]

    for token in self.normalizedTokens():
        # The token most recently handed to a phase; used after the loop
        # for the self-closing check.
        prev_token = None
        new_token = token
        while new_token is not None:
            prev_token = new_token
            currentNode = self.tree.openElements[-1] if self.tree.openElements else None
            currentNodeNamespace = currentNode.namespace if currentNode else None
            currentNodeName = currentNode.name if currentNode else None

            tokenType = new_token["type"]

            if tokenType == ParseErrorToken:
                self.parseError(new_token["data"], new_token.get("datavars", {}))
                new_token = None
            else:
                # Only tag tokens carry a "name", so every name lookup is
                # guarded by a start-tag type check and is made on the token
                # actually being processed.
                if (len(self.tree.openElements) == 0 or
                    currentNodeNamespace == self.tree.defaultNamespace or
                    (self.isMathMLTextIntegrationPoint(currentNode) and
                     ((tokenType == StartTagToken and
                       new_token["name"] not in ("mglyph", "malignmark")) or
                      tokenType in (CharactersToken, SpaceCharactersToken))) or
                    (currentNodeNamespace == namespaces["mathml"] and
                     currentNodeName == "annotation-xml" and
                     tokenType == StartTagToken and
                     new_token["name"] == "svg") or
                    (self.isHTMLIntegrationPoint(currentNode) and
                     tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                    phase = self.phase
                else:
                    phase = self.phases["inForeignContent"]

                if tokenType == CharactersToken:
                    new_token = phase.processCharacters(new_token)
                elif tokenType == SpaceCharactersToken:
                    new_token = phase.processSpaceCharacters(new_token)
                elif tokenType == StartTagToken:
                    new_token = phase.processStartTag(new_token)
                elif tokenType == EndTagToken:
                    new_token = phase.processEndTag(new_token)
                elif tokenType == CommentToken:
                    new_token = phase.processComment(new_token)
                elif tokenType == DoctypeToken:
                    new_token = phase.processDoctype(new_token)

        # tokenType and prev_token both describe the last token processed
        # above, so the flag lookups are always made on a start tag.
        if (tokenType == StartTagToken and prev_token["selfClosing"]
            and not prev_token["selfClosingAcknowledged"]):
            self.parseError("non-void-element-with-trailing-solidus",
                            {"name": prev_token["name"]})

    # When the loop finishes it's EOF
    reprocess = True
    phases = []
    while reprocess:
        phases.append(self.phase)
        reprocess = self.phase.processEOF()
        if reprocess:
            # A phase asking for reprocessing must have switched to a phase
            # that has not handled EOF yet.
            assert self.phase not in phases
|
|
|
|
def normalizedTokens(self):
    """Yield each token of self.tokenizer after passing it through
    normalizeToken."""
    for rawToken in self.tokenizer:
        normalized = self.normalizeToken(rawToken)
        yield normalized
|
|
|
|
def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
    """Parse a HTML document into a well-formed tree

    stream - a filelike object or string containing the HTML to be parsed

    The optional encoding parameter must be a string that indicates
    the encoding.  If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    parseMeta and useChardet are forwarded unchanged to _parse. The return
    value is whatever the tree builder's getDocument() returns.
    """
    options = dict(innerHTML=False, encoding=encoding,
                   parseMeta=parseMeta, useChardet=useChardet)
    self._parse(stream, **options)
    return self.tree.getDocument()
|
|
|
|
def parseFragment(self, stream, container="div", encoding=None,
                  parseMeta=False, useChardet=True):
    """Parse a HTML fragment into a well-formed tree fragment

    container - name of the element we're setting the innerHTML property
    if set to None, default to 'div'

    stream - a filelike object or string containing the HTML to be parsed

    The optional encoding parameter must be a string that indicates
    the encoding.  If specified, that encoding will be used,
    regardless of any BOM or later declaration (such as in a meta
    element)

    The return value is whatever the tree builder's getFragment() returns.
    """
    # NOTE(review): parseMeta and useChardet are accepted here but are not
    # passed on, so _parse runs with its own defaults for both - confirm
    # whether that is intended.
    fragmentMode = True
    self._parse(stream, fragmentMode, container=container, encoding=encoding)
    return self.tree.getFragment()
|
|
|
|
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
    """Record a parse error and, in strict mode, raise ParseError.

    errorcode - identifier of the error
    datavars - optional dict of values associated with the error; a fresh
      empty dict is used when omitted, so recorded errors never share one
      dict object

    The tuple (stream position, errorcode, datavars) is appended to
    self.errors.
    """
    # XXX The idea is to make errorcode mandatory.
    if datavars is None:
        datavars = {}
    self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
    if self.strict:
        raise ParseError
|
|
|
|
def normalizeToken(self, token):
    """ HTML5 specific normalizations to the token stream """
    if token["type"] != tokenTypes["StartTag"]:
        return token

    # Build the attribute dict from the reversed sequence, so that when a
    # name occurs more than once the earliest occurrence is the one kept.
    reversedAttributes = token["data"][::-1]
    token["data"] = dict(reversedAttributes)
    return token
|
|
|
|
def adjustMathMLAttributes(self, token):
    """Rename the lowercased "definitionurl" attribute of ``token`` to its
    mixed-case form "definitionURL", in place."""
    attributes = token["data"]
    for lowered, proper in (("definitionurl", u"definitionURL"),):
        if lowered not in attributes:
            continue
        attributes[proper] = attributes[lowered]
        del attributes[lowered]
|
|
|
|
def adjustSVGAttributes(self, token):
    """Rename lowercased SVG attribute names of ``token`` to their
    mixed-case forms, in place. Attributes not in the table are kept."""
    replacements = {
        "attributename": u"attributeName",
        "attributetype": u"attributeType",
        "basefrequency": u"baseFrequency",
        "baseprofile": u"baseProfile",
        "calcmode": u"calcMode",
        "clippathunits": u"clipPathUnits",
        "contentscripttype": u"contentScriptType",
        "contentstyletype": u"contentStyleType",
        "diffuseconstant": u"diffuseConstant",
        "edgemode": u"edgeMode",
        "externalresourcesrequired": u"externalResourcesRequired",
        "filterres": u"filterRes",
        "filterunits": u"filterUnits",
        "glyphref": u"glyphRef",
        "gradienttransform": u"gradientTransform",
        "gradientunits": u"gradientUnits",
        "kernelmatrix": u"kernelMatrix",
        "kernelunitlength": u"kernelUnitLength",
        "keypoints": u"keyPoints",
        "keysplines": u"keySplines",
        "keytimes": u"keyTimes",
        "lengthadjust": u"lengthAdjust",
        "limitingconeangle": u"limitingConeAngle",
        "markerheight": u"markerHeight",
        "markerunits": u"markerUnits",
        "markerwidth": u"markerWidth",
        "maskcontentunits": u"maskContentUnits",
        "maskunits": u"maskUnits",
        "numoctaves": u"numOctaves",
        "pathlength": u"pathLength",
        "patterncontentunits": u"patternContentUnits",
        "patterntransform": u"patternTransform",
        "patternunits": u"patternUnits",
        "pointsatx": u"pointsAtX",
        "pointsaty": u"pointsAtY",
        "pointsatz": u"pointsAtZ",
        "preservealpha": u"preserveAlpha",
        "preserveaspectratio": u"preserveAspectRatio",
        "primitiveunits": u"primitiveUnits",
        "refx": u"refX",
        "refy": u"refY",
        "repeatcount": u"repeatCount",
        "repeatdur": u"repeatDur",
        "requiredextensions": u"requiredExtensions",
        "requiredfeatures": u"requiredFeatures",
        "specularconstant": u"specularConstant",
        "specularexponent": u"specularExponent",
        "spreadmethod": u"spreadMethod",
        "startoffset": u"startOffset",
        "stddeviation": u"stdDeviation",
        "stitchtiles": u"stitchTiles",
        "surfacescale": u"surfaceScale",
        "systemlanguage": u"systemLanguage",
        "tablevalues": u"tableValues",
        "targetx": u"targetX",
        "targety": u"targetY",
        "textlength": u"textLength",
        "viewbox": u"viewBox",
        "viewtarget": u"viewTarget",
        "xchannelselector": u"xChannelSelector",
        "ychannelselector": u"yChannelSelector",
        "zoomandpan": u"zoomAndPan",
    }
    attributes = token["data"]
    # Walk a list of the names present before any renaming happens.
    for lowered in list(attributes):
        if lowered not in replacements:
            continue
        proper = replacements[lowered]
        attributes[proper] = attributes[lowered]
        del attributes[lowered]
|
|
|
|
def adjustForeignAttributes(self, token):
    """Re-key namespaced attributes of ``token`` in place.

    Attribute names such as "xlink:href", "xml:lang" or "xmlns" are
    replaced by a (prefix, local name, namespace URI) tuple key; the
    attribute value is kept. Attributes not in the table are untouched.
    """
    replacements = {
        "xlink:actuate":("xlink", "actuate", namespaces["xlink"]),
        "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]),
        "xlink:href":("xlink", "href", namespaces["xlink"]),
        "xlink:role":("xlink", "role", namespaces["xlink"]),
        "xlink:show":("xlink", "show", namespaces["xlink"]),
        "xlink:title":("xlink", "title", namespaces["xlink"]),
        "xlink:type":("xlink", "type", namespaces["xlink"]),
        "xml:base":("xml", "base", namespaces["xml"]),
        "xml:lang":("xml", "lang", namespaces["xml"]),
        "xml:space":("xml", "space", namespaces["xml"]),
        "xmlns":(None, "xmlns", namespaces["xmlns"]),
        "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"])
    }

    attributes = token["data"]
    # Iterate over a snapshot of the keys: the loop body inserts and deletes
    # entries, which must not happen while iterating the dict itself.
    for originalName in list(attributes):
        if originalName in replacements:
            foreignName = replacements[originalName]
            attributes[foreignName] = attributes[originalName]
            del attributes[originalName]
|
|
|
|
def reparseTokenNormal(self, token):
    # NOTE(review): this method reads self.parser, but no visible code of
    # this class sets a ``parser`` attribute (the phase objects do), and
    # ``token`` is unused. Looks like dead code - confirm before relying
    # on it.
    self.parser.phase()
|
|
|
|
def resetInsertionMode(self):
    """Choose self.phase from the stack of open elements.

    The stack is scanned from the most recently opened element down to the
    root. The first element whose name has an entry in the table below
    decides the phase. Elements outside the default namespace are skipped,
    and the root element is treated as if it were the innerHTML context
    element, falling back to "inBody".
    """
    # The name of this method is mostly historical. (It's also used in the
    # specification.)
    phaseNameFor = {
        "select": "inSelect",
        "td": "inCell",
        "th": "inCell",
        "tr": "inRow",
        "tbody": "inTableBody",
        "thead": "inTableBody",
        "tfoot": "inTableBody",
        "caption": "inCaption",
        "colgroup": "inColumnGroup",
        "table": "inTable",
        "head": "inBody",
        "body": "inBody",
        "frameset": "inFrameset",
        "html": "beforeHead",
    }
    innerHTMLOnlyNames = ("select", "colgroup", "head", "html")

    atRoot = False
    for node in reversed(self.tree.openElements):
        chosen = None
        name = node.name
        if node == self.tree.openElements[0]:
            # Bottom of the stack: use the fragment context element instead.
            assert self.innerHTML
            atRoot = True
            name = self.innerHTML
        # Check for conditions that should only happen in the innerHTML
        # case
        if name in innerHTMLOnlyNames:
            assert self.innerHTML

        if atRoot or node.namespace == self.tree.defaultNamespace:
            if name in phaseNameFor:
                chosen = self.phases[phaseNameFor[name]]
                break
            if atRoot:
                chosen = self.phases["inBody"]
                break

    self.phase = chosen
|
|
|
|
def parseRCDataRawtext(self, token, contentType):
    """Generic RCDATA/RAWTEXT Parsing algorithm
    contentType - RCDATA or RAWTEXT

    Inserts an element for ``token``, switches the tokenizer to the state
    matching ``contentType``, remembers the current phase in
    self.originalPhase and moves to the "text" phase.
    """
    assert contentType in ("RAWTEXT", "RCDATA")

    self.tree.insertElement(token)

    tok = self.tokenizer
    tok.state = tok.rawtextState if contentType == "RAWTEXT" else tok.rcdataState

    self.originalPhase = self.phase

    self.phase = self.phases["text"]
|
|
|
|
def getPhases(debug):
|
|
def log(function):
    """Logger that records which phase processes each token"""
    # Reverse lookup: token type value -> token type name.
    type_names = dict((value, key) for key, value in
                      constants.tokenTypes.iteritems())

    def wrapped(self, *args, **kwargs):
        # Only "process*" methods called with a positional token are logged.
        if not (function.__name__.startswith("process") and len(args) > 0):
            return function(self, *args, **kwargs)

        token = args[0]
        info = {"type": type_names[token['type']]}
        if token['type'] in constants.tagTokenTypes:
            info["name"] = token['name']

        entry = (self.parser.tokenizer.state.__name__,
                 self.parser.phase.__class__.__name__,
                 self.__class__.__name__,
                 function.__name__,
                 info)
        self.parser.log.append(entry)
        return function(self, *args, **kwargs)

    return wrapped
|
|
|
|
def getMetaclass(use_metaclass, metaclass_func):
    """Return the metaclass to use for the phase classes.

    With a false ``use_metaclass`` this is plain ``type``; otherwise it is
    the metaclass built by method_decorator_metaclass(metaclass_func).
    """
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)
|
|
|
|
class Phase(object):
    """Base class for helper object that implements each phase of processing

    Every instance keeps a reference to the parser and to the tree builder.
    Start and end tags are dispatched by tag name through the
    startTagHandler / endTagHandler mappings, which are not created here
    and must be provided by the subclass.
    """
    # Order should be (they can be omitted):
    # * EOF
    # * Comment
    # * Doctype
    # * SpaceCharacters
    # * Characters
    # * StartTag
    #   - startTag* methods
    # * EndTag
    #   - endTag* methods

    __metaclass__ = getMetaclass(debug, log)

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insertComment(token, self.tree.openElements[-1])

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        handler = self.startTagHandler[token["name"]]
        return handler(token)

    def startTagHtml(self, token):
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        # Copy over only the attributes the root element does not have yet.
        for attr, value in token["data"].items():
            if attr in self.tree.openElements[0].attributes:
                continue
            self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        handler = self.endTagHandler[token["name"]]
        return handler(token)
|
|
|
|
class InitialPhase(Phase):
    """Phase used before anything but whitespace, comments or a doctype
    has been seen; it decides the document's compatibility mode."""

    # Lowercased public identifier prefixes that select quirks mode.
    _quirksPublicIdPrefixes = (
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
    )

    # Lowercased public identifiers that select quirks mode on exact match.
    _quirksPublicIds = (
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-/w3c/dtd html 4.0 transitional/en",
        "html",
    )

    # Quirks without a system identifier, limited quirks with one.
    _html401PublicIdPrefixes = (
        "-//w3c//dtd html 4.01 frameset//",
        "-//w3c//dtd html 4.01 transitional//",
    )

    # Always limited quirks (when no quirks rule matched first).
    _xhtml10PublicIdPrefixes = (
        "-//w3c//dtd xhtml 1.0 frameset//",
        "-//w3c//dtd xhtml 1.0 transitional//",
    )

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        if (name != "html" or publicId is not None or
            systemId is not None and systemId != "about:legacy-compat"):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        # Public identifiers are compared after ASCII lowercasing.
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        if (not correct or token["name"] != "html"
            or startswithany(publicId, self._quirksPublicIdPrefixes)
            or publicId in self._quirksPublicIds
            or startswithany(publicId, self._html401PublicIdPrefixes) and
                systemId is None
            or systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
            self.parser.compatMode = "quirks"
        elif (startswithany(publicId, self._xhtml10PublicIdPrefixes)
              or startswithany(publicId, self._html401PublicIdPrefixes) and
                  systemId is not None):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        # No doctype was seen: quirks mode, then continue with "beforeHtml".
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
|
|
|
|
|
|
class BeforeHtmlPhase(Phase):
    """Phase that creates the root html element and then hands over to
    the "beforeHead" phase."""

    # helper methods
    def insertHtmlElement(self):
        rootToken = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(rootToken)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        # Only these end tags imply the root element; any other end tag is
        # reported as an error and not reprocessed.
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
|
|
|
|
|
|
class BeforeHeadPhase(Phase):
    """Phase that waits for the head element, creating an implied one
    whenever something other than whitespace, <html> or <head> arrives."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("head", self.startTagHead),
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            (("head", "body", "html", "br"), self.endTagImplyHead),
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def _insertImpliedHead(self):
        # Behave as if a <head> start tag had been seen.
        self.startTagHead(impliedTagToken("head", "StartTag"))

    def processEOF(self):
        self._insertImpliedHead()
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self._insertImpliedHead()
        return token

    def startTagHtml(self, token):
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagHead(self, token):
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        self._insertImpliedHead()
        return token

    def endTagImplyHead(self, token):
        self._insertImpliedHead()
        return token

    def endTagOther(self, token):
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})
|
|
|
|
class InHeadPhase(Phase):
    """Phase handling the content of the head element."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
            ("script", self.startTagScript),
            (("base", "basefont", "bgsound", "command", "link"),
             self.startTagBaseLinkCommand),
            ("meta", self.startTagMeta),
            ("head", self.startTagHead),
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("head", self.endTagHead),
            (("br", "html", "body"), self.endTagHtmlBodyBr),
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # Insert the element and take it straight off the open-elements
        # stack again.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        stream = self.parser.tokenizer.stream
        # Only a tentative encoding may be replaced by what <meta> declares.
        if stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif "content" in attributes:
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            data = inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
            parser = inputstream.ContentAttrParser(data)
            codec = parser.parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoScriptNoFramesStyle(self, token):
        #Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        self.tree.insertElement(token)
        tokenizer = self.parser.tokenizer
        tokenizer.state = tokenizer.scriptDataState
        # Remember the current phase, then switch to the "text" phase.
        self.parser.originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        node = self.parser.tree.openElements.pop()
        assert node.name == "head", "Expected head got %s"%node.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Behave as if a </head> end tag had been seen.
        self.endTagHead(impliedTagToken("head"))
|
|
|
|
|
|
# XXX If we implement a parser for which scripting is disabled we need to
|
|
# implement this phase.
|
|
#
|
|
# class InHeadNoScriptPhase(Phase):
|
|
|
|
class AfterHeadPhase(Phase):
    """Phase between the end of the head element and the start of the
    body or frameset."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
              "style", "title"),
             self.startTagFromHead),
            ("head", self.startTagHead),
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            (("body", "html", "br"), self.endTagHtmlBodyBr),
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagBody(self, token):
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        # Put the head element back on the stack, let the "inHead" phase
        # handle the tag, then take the head element off again.
        openElements = self.tree.openElements
        openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for node in openElements[::-1]:
            if node.name == "head":
                openElements.remove(node)
                break

    def startTagHead(self, token):
        self.parser.parseError("unexpected-start-tag", {"name":token["name"]})

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name":token["name"]})

    def anythingElse(self):
        # Insert an implied body element and continue in the "inBody" phase.
        self.tree.insertElement(impliedTagToken("body", "StartTag"))
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True
|
|
|
|
|
|
class InBodyPhase(Phase):
    """Phase implementing the "in body" insertion mode.

    Start and end tags are routed through the two ``utils.MethodDispatcher``
    tables built in ``__init__``; character data is handled by the
    ``process*Characters`` methods.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
    # the really-really-really-very crazy mode
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # Keep a ref to this for special handling of whitespace in <pre>
        self.processSpaceCharactersNonPre = self.processSpaceCharacters

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("base", "basefont", "bgsound", "command", "link", "meta",
              "noframes", "script", "style", "title"),
             self.startTagProcessInHead),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("address", "article", "aside", "blockquote", "center",
              "details", "dir", "div", "dl", "fieldset", "figcaption",
              "figure", "footer", "header", "hgroup", "menu", "nav", "ol",
              "p", "section", "summary", "ul"),
             self.startTagCloseP),
            (headingElements, self.startTagHeading),
            (("pre", "listing"), self.startTagPreListing),
            ("form", self.startTagForm),
            (("li", "dd", "dt"), self.startTagListItem),
            ("plaintext", self.startTagPlaintext),
            ("a", self.startTagA),
            (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
              "strong", "tt", "u"), self.startTagFormatting),
            ("nobr", self.startTagNobr),
            ("button", self.startTagButton),
            (("applet", "marquee", "object"),
             self.startTagAppletMarqueeObject),
            ("xmp", self.startTagXmp),
            ("table", self.startTagTable),
            (("area", "br", "embed", "img", "keygen", "wbr"),
             self.startTagVoidFormatting),
            (("param", "source", "track"), self.startTagParamSource),
            ("input", self.startTagInput),
            ("hr", self.startTagHr),
            ("image", self.startTagImage),
            ("isindex", self.startTagIsIndex),
            ("textarea", self.startTagTextarea),
            ("iframe", self.startTagIFrame),
            # NOTE(review): "noframes" is also registered with the in-head
            # group above; which entry wins depends on how MethodDispatcher
            # resolves duplicate keys, so both registrations are kept.
            (("noembed", "noframes", "noscript"), self.startTagRawtext),
            ("select", self.startTagSelect),
            (("rp", "rt"), self.startTagRpRt),
            (("option", "optgroup"), self.startTagOpt),
            ("math", self.startTagMath),
            ("svg", self.startTagSvg),
            (("caption", "col", "colgroup", "frame", "head",
              "tbody", "td", "tfoot", "th", "thead",
              "tr"), self.startTagMisplaced)
            ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("body", self.endTagBody),
            ("html", self.endTagHtml),
            (("address", "article", "aside", "blockquote", "center",
              "details", "dir", "div", "dl", "fieldset", "figcaption",
              "figure", "footer", "header", "hgroup", "listing", "menu",
              "nav", "ol", "pre", "section", "summary", "ul"),
             self.endTagBlock),
            ("form", self.endTagForm),
            ("p", self.endTagP),
            (("dd", "dt", "li"), self.endTagListItem),
            (headingElements, self.endTagHeading),
            (("a", "b", "big", "code", "em", "font", "i", "nobr", "s",
              "small", "strike", "strong", "tt", "u"),
             self.endTagFormatting),
            (("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
            ("br", self.endTagBr),
            ])
        self.endTagHandler.default = self.endTagOther

    def isMatchingFormattingElement(self, node1, node2):
        """Return True if both nodes have the same name, namespace and
        attribute (name, value) pairs."""
        if node1.name != node2.name or node1.namespace != node2.namespace:
            return False
        if len(node1.attributes) != len(node2.attributes):
            return False
        # Same length, so comparing the sorted item lists is equivalent to
        # comparing them pair by pair.
        return (sorted(node1.attributes.items()) ==
                sorted(node2.attributes.items()))

    # helper
    def addFormattingElement(self, token):
        """Insert the element for ``token`` and record it in the list of
        active formatting elements.

        If three matching elements already exist after the last Marker, the
        earliest of them is dropped from the list first.
        """
        self.tree.insertElement(token)
        element = self.tree.openElements[-1]

        matchingElements = []
        for node in self.tree.activeFormattingElements[::-1]:
            if node is Marker:
                break
            elif self.isMatchingFormattingElement(node, element):
                matchingElements.append(node)

        assert len(matchingElements) <= 3
        if len(matchingElements) == 3:
            # The list was walked newest-first, so [-1] is the oldest match.
            self.tree.activeFormattingElements.remove(matchingElements[-1])
        self.tree.activeFormattingElements.append(element)

    # the real deal
    def processEOF(self):
        """Report a parse error if any open element is not one that may be
        left open at end of file."""
        allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                      "tfoot", "th", "thead", "tr", "body",
                                      "html"))
        for node in self.tree.openElements[::-1]:
            if node.name not in allowed_elements:
                self.parser.parseError("expected-closing-tag-but-got-eof")
                break
        # Stop parsing

    def processSpaceCharactersDropNewline(self, token):
        """One-shot replacement for ``processSpaceCharacters``.

        Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
        want to drop leading newlines.  The normal handler is restored
        before anything else happens.
        """
        data = token["data"]
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
        if (data.startswith("\n") and
            self.tree.openElements[-1].name in ("pre", "listing", "textarea")
            and not self.tree.openElements[-1].hasContent()):
            data = data[1:]
        if data:
            self.tree.reconstructActiveFormattingElements()
            self.tree.insertText(data)

    def processCharacters(self, token):
        """Insert character data; non-space characters clear framesetOK."""
        if token["data"] == u"\u0000":
            # The tokenizer should always emit null on its own
            return
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertText(token["data"])
        # A generator lets any() stop at the first non-space character
        # instead of scanning (and materialising) the whole string.
        if (self.parser.framesetOK and
            any(char not in spaceCharacters
                for char in token["data"])):
            self.parser.framesetOK = False

    def processSpaceCharacters(self, token):
        """Insert whitespace-only character data."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertText(token["data"])

    def startTagProcessInHead(self, token):
        """Delegate the start tag to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagBody(self, token):
        """Handle a stray <body> start tag: parse error, then copy any
        attributes that the existing body element (second open element)
        does not already have."""
        self.parser.parseError("unexpected-start-tag", {"name": "body"})
        if (len(self.tree.openElements) == 1
            or self.tree.openElements[1].name != "body"):
            assert self.parser.innerHTML
        else:
            self.parser.framesetOK = False
            bodyAttributes = self.tree.openElements[1].attributes
            # items() works on both Python 2 and 3 dicts (iteritems() is
            # Python 2 only) and matches isMatchingFormattingElement.
            for attr, value in token["data"].items():
                if attr not in bodyAttributes:
                    bodyAttributes[attr] = value

    def startTagFrameset(self, token):
        """Handle <frameset> inside body: if still allowed (framesetOK),
        detach the body element, pop back to <html> and switch to the
        "inFrameset" phase."""
        self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
        if (len(self.tree.openElements) == 1 or
            self.tree.openElements[1].name != "body"):
            assert self.parser.innerHTML
        elif not self.parser.framesetOK:
            pass
        else:
            if self.tree.openElements[1].parent:
                self.tree.openElements[1].parent.removeChild(
                    self.tree.openElements[1])
            while self.tree.openElements[-1].name != "html":
                self.tree.openElements.pop()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inFrameset"]

    def startTagCloseP(self, token):
        """Close an open <p> in button scope, then insert the element."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)

    def startTagPreListing(self, token):
        """Like startTagCloseP, but also clears framesetOK and arms the
        drop-leading-newline whitespace handler."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline

    def startTagForm(self, token):
        """Insert a <form> and remember it as the form pointer, unless a
        form pointer is already set (parse error, token ignored)."""
        if self.tree.formPointer:
            self.parser.parseError(u"unexpected-start-tag", {"name": "form"})
        else:
            if self.tree.elementInScope("p", variant="button"):
                self.endTagP(impliedTagToken("p"))
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]

    def startTagListItem(self, token):
        """Handle <li>, <dd> and <dt>: implicitly close a still-open
        sibling item before inserting the new one."""
        self.parser.framesetOK = False

        stopNamesMap = {"li": ["li"],
                        "dt": ["dt", "dd"],
                        "dd": ["dt", "dd"]}
        stopNames = stopNamesMap[token["name"]]
        for node in reversed(self.tree.openElements):
            if node.name in stopNames:
                self.parser.phase.processEndTag(
                    impliedTagToken(node.name, "EndTag"))
                break
            # Any special element other than address/div/p ends the search.
            if (node.nameTuple in specialElements and
                node.name not in ("address", "div", "p")):
                break

        if self.tree.elementInScope("p", variant="button"):
            self.parser.phase.processEndTag(
                impliedTagToken("p", "EndTag"))

        self.tree.insertElement(token)

    def startTagPlaintext(self, token):
        """Insert <plaintext> and switch the tokenizer to its plaintext
        state."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.tokenizer.state = self.parser.tokenizer.plaintextState

    def startTagHeading(self, token):
        """Insert a heading element; a heading that is the current node is
        popped first (parse error)."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        if self.tree.openElements[-1].name in headingElements:
            self.parser.parseError("unexpected-start-tag",
                                   {"name": token["name"]})
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagA(self, token):
        """Handle <a>: an <a> still in the active formatting elements is
        closed (and forcibly removed) before the new one is added."""
        afeAElement = self.tree.elementInActiveFormattingElements("a")
        if afeAElement:
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "a", "endName": "a"})
            self.endTagFormatting(impliedTagToken("a"))
            if afeAElement in self.tree.openElements:
                self.tree.openElements.remove(afeAElement)
            if afeAElement in self.tree.activeFormattingElements:
                self.tree.activeFormattingElements.remove(afeAElement)
        self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagFormatting(self, token):
        """Insert a formatting element (b, i, em, ...)."""
        self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagNobr(self, token):
        """Handle <nobr>: a <nobr> already in scope is closed first."""
        self.tree.reconstructActiveFormattingElements()
        if self.tree.elementInScope("nobr"):
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "nobr", "endName": "nobr"})
            self.processEndTag(impliedTagToken("nobr"))
            # XXX Need tests that trigger the following
            self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagButton(self, token):
        """Handle <button>: if one is already in scope, close it and return
        the token; otherwise insert the element."""
        if self.tree.elementInScope("button"):
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "button",
                                    "endName": "button"})
            self.processEndTag(impliedTagToken("button"))
            return token
        else:
            self.tree.reconstructActiveFormattingElements()
            self.tree.insertElement(token)
            self.parser.framesetOK = False

    def startTagAppletMarqueeObject(self, token):
        """Insert the element and push a Marker onto the active formatting
        elements."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.tree.activeFormattingElements.append(Marker)
        self.parser.framesetOK = False

    def startTagXmp(self, token):
        """Handle <xmp> as a RAWTEXT element."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.reconstructActiveFormattingElements()
        self.parser.framesetOK = False
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagTable(self, token):
        """Insert <table> and switch to the "inTable" phase.  Outside quirks
        mode an open <p> in button scope is closed first."""
        if self.parser.compatMode != "quirks":
            if self.tree.elementInScope("p", variant="button"):
                self.processEndTag(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        self.parser.phase = self.parser.phases["inTable"]

    def startTagVoidFormatting(self, token):
        """Insert a void element (area, br, img, ...) and pop it again
        immediately."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
        self.parser.framesetOK = False

    def startTagInput(self, token):
        """Handle <input> like other void elements, except that
        type=hidden leaves framesetOK untouched."""
        framesetOK = self.parser.framesetOK
        self.startTagVoidFormatting(token)
        if ("type" in token["data"] and
            token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            # input type=hidden doesn't change framesetOK
            self.parser.framesetOK = framesetOK

    def startTagParamSource(self, token):
        """Insert param/source/track and pop it again immediately."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagHr(self, token):
        """Insert <hr> (closing an open <p> first) and pop it again."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
        self.parser.framesetOK = False

    def startTagImage(self, token):
        """Treat <image> as <img> (with a parse error)."""
        # No really...
        self.parser.parseError("unexpected-start-tag-treated-as",
                               {"originalName": "image", "newName": "img"})
        self.processStartTag(impliedTagToken("img", "StartTag",
                                             attributes=token["data"],
                                             selfClosing=token["selfClosing"]))

    def startTagIsIndex(self, token):
        """Expand the deprecated <isindex> into
        form / hr / label + prompt text + input / hr.

        Ignored (after the parse error) when a form pointer is set.
        """
        self.parser.parseError("deprecated-tag", {"name": "isindex"})
        if self.tree.formPointer:
            return
        form_attrs = {}
        if "action" in token["data"]:
            form_attrs["action"] = token["data"]["action"]
        self.processStartTag(impliedTagToken("form", "StartTag",
                                             attributes=form_attrs))
        self.processStartTag(impliedTagToken("hr", "StartTag"))
        self.processStartTag(impliedTagToken("label", "StartTag"))
        # XXX Localization ...
        if "prompt" in token["data"]:
            prompt = token["data"]["prompt"]
        else:
            prompt = u"This is a searchable index. Enter search keywords: "
        self.processCharacters(
            {"type": tokenTypes["Characters"], "data": prompt})
        # The input gets every original attribute except action/prompt,
        # and its name is forced to "isindex".
        attributes = token["data"].copy()
        if "action" in attributes:
            del attributes["action"]
        if "prompt" in attributes:
            del attributes["prompt"]
        attributes["name"] = "isindex"
        self.processStartTag(impliedTagToken("input", "StartTag",
                                             attributes=attributes,
                                             selfClosing=token["selfClosing"]))
        self.processEndTag(impliedTagToken("label"))
        self.processStartTag(impliedTagToken("hr", "StartTag"))
        self.processEndTag(impliedTagToken("form"))

    def startTagTextarea(self, token):
        """Insert <textarea>, switch the tokenizer to RCDATA and arm the
        drop-leading-newline whitespace handler."""
        self.tree.insertElement(token)
        self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline
        self.parser.framesetOK = False

    def startTagIFrame(self, token):
        """Handle <iframe> as RAWTEXT and clear framesetOK."""
        self.parser.framesetOK = False
        self.startTagRawtext(token)

    def startTagRawtext(self, token):
        """iframe, noembed noframes, noscript(if scripting enabled)"""
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagOpt(self, token):
        """Handle <option>/<optgroup>: an <option> that is the current node
        is closed first."""
        if self.tree.openElements[-1].name == "option":
            self.parser.phase.processEndTag(impliedTagToken("option"))
        self.tree.reconstructActiveFormattingElements()
        self.parser.tree.insertElement(token)

    def startTagSelect(self, token):
        """Insert <select> and switch to "inSelectInTable" when the current
        phase is one of the table phases, else to "inSelect"."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        if self.parser.phase in (self.parser.phases["inTable"],
                                 self.parser.phases["inCaption"],
                                 self.parser.phases["inColumnGroup"],
                                 self.parser.phases["inTableBody"],
                                 self.parser.phases["inRow"],
                                 self.parser.phases["inCell"]):
            self.parser.phase = self.parser.phases["inSelectInTable"]
        else:
            self.parser.phase = self.parser.phases["inSelect"]

    def startTagRpRt(self, token):
        """Insert <rp>/<rt>; inside <ruby>, implied end tags are generated
        first and a parse error is raised if <ruby> is not then current."""
        if self.tree.elementInScope("ruby"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "ruby":
                self.parser.parseError()
        self.tree.insertElement(token)

    def startTagMath(self, token):
        """Insert a <math> element in the MathML namespace."""
        self.tree.reconstructActiveFormattingElements()
        self.parser.adjustMathMLAttributes(token)
        self.parser.adjustForeignAttributes(token)
        token["namespace"] = namespaces["mathml"]
        self.tree.insertElement(token)
        # Need to get the parse error right for the case where the token
        # has a namespace not equal to the xmlns attribute
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def startTagSvg(self, token):
        """Insert an <svg> element in the SVG namespace."""
        self.tree.reconstructActiveFormattingElements()
        self.parser.adjustSVGAttributes(token)
        self.parser.adjustForeignAttributes(token)
        token["namespace"] = namespaces["svg"]
        self.tree.insertElement(token)
        # Need to get the parse error right for the case where the token
        # has a namespace not equal to the xmlns attribute
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def startTagMisplaced(self, token):
        """ Elements that should be children of other elements that have a
        different insertion mode; here they are ignored
        "caption", "col", "colgroup", "frame", "head",
        "tbody", "td", "tfoot", "th", "thead", "tr"
        """
        self.parser.parseError("unexpected-start-tag-ignored",
                               {"name": token["name"]})

    def startTagOther(self, token):
        """Default start-tag handler: insert an ordinary element."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)

    def endTagP(self, token):
        """Handle </p>.  With no <p> in button scope, an empty <p> is
        synthesised (parse error) and then closed by a recursive call."""
        if not self.tree.elementInScope("p", variant="button"):
            self.startTagCloseP(impliedTagToken("p", "StartTag"))
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
            self.endTagP(impliedTagToken("p", "EndTag"))
        else:
            self.tree.generateImpliedEndTags("p")
            if self.tree.openElements[-1].name != "p":
                self.parser.parseError("unexpected-end-tag", {"name": "p"})
            node = self.tree.openElements.pop()
            while node.name != "p":
                node = self.tree.openElements.pop()

    def endTagBody(self, token):
        """Handle </body>: report unexpectedly open elements and switch to
        the "afterBody" phase.  Ignored when no body is in scope."""
        if not self.tree.elementInScope("body"):
            self.parser.parseError()
            return
        elif self.tree.openElements[-1].name != "body":
            # Built once rather than on every loop iteration.
            mayStayOpen = frozenset(("dd", "dt", "li", "optgroup",
                                     "option", "p", "rp", "rt",
                                     "tbody", "td", "tfoot",
                                     "th", "thead", "tr", "body",
                                     "html"))
            for node in self.tree.openElements[2:]:
                if node.name not in mayStayOpen:
                    # Not sure this is the correct name for the parse error
                    self.parser.parseError(
                        "expected-one-end-tag-but-got-another",
                        {"expectedName": "body", "gotName": node.name})
                    break
        self.parser.phase = self.parser.phases["afterBody"]

    def endTagHtml(self, token):
        """Handle </html> as an implied </body>; the token is returned only
        when a body was in scope."""
        # We repeat the test for the body end tag token being ignored here
        if self.tree.elementInScope("body"):
            self.endTagBody(impliedTagToken("body"))
            return token

    def endTagBlock(self, token):
        """Handle the end tag of a block-level element."""
        # Put us back in the right whitespace handling mode
        if token["name"] == "pre":
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
        inScope = self.tree.elementInScope(token["name"])
        if inScope:
            self.tree.generateImpliedEndTags()
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early",
                                   {"name": token["name"]})
        if inScope:
            node = self.tree.openElements.pop()
            while node.name != token["name"]:
                node = self.tree.openElements.pop()

    def endTagForm(self, token):
        """Handle </form>: clear the form pointer and remove that form from
        the open elements if it is in scope."""
        node = self.tree.formPointer
        self.tree.formPointer = None
        if node is None or not self.tree.elementInScope(node):
            self.parser.parseError("unexpected-end-tag",
                                   {"name": "form"})
        else:
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1] != node:
                self.parser.parseError("end-tag-too-early-ignored",
                                       {"name": "form"})
            self.tree.openElements.remove(node)

    def endTagListItem(self, token):
        """Handle </li>, </dd> and </dt> (</li> uses the "list" scope
        variant)."""
        if token["name"] == "li":
            variant = "list"
        else:
            variant = None
        if not self.tree.elementInScope(token["name"], variant=variant):
            self.parser.parseError("unexpected-end-tag",
                                   {"name": token["name"]})
        else:
            self.tree.generateImpliedEndTags(exclude=token["name"])
            if self.tree.openElements[-1].name != token["name"]:
                self.parser.parseError(
                    "end-tag-too-early",
                    {"name": token["name"]})
            node = self.tree.openElements.pop()
            while node.name != token["name"]:
                node = self.tree.openElements.pop()

    def endTagHeading(self, token):
        """Handle </h1>..</h6>: any heading element in scope is closed, not
        just the one named by the token."""
        for item in headingElements:
            if self.tree.elementInScope(item):
                self.tree.generateImpliedEndTags()
                break
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early",
                                   {"name": token["name"]})

        for item in headingElements:
            if self.tree.elementInScope(item):
                # Pop up to and including the nearest open heading.
                node = self.tree.openElements.pop()
                while node.name not in headingElements:
                    node = self.tree.openElements.pop()
                break

    def endTagFormatting(self, token):
        """The much-feared adoption agency algorithm"""
        # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
        # XXX Better parseError messages appreciated.
        name = token["name"]

        # The outer loop runs at most eight times.
        outerLoopCounter = 0
        while outerLoopCounter < 8:
            outerLoopCounter += 1

            # Step 1 paragraph 1
            formattingElement = self.tree.elementInActiveFormattingElements(
                name)
            if (not formattingElement or
                (formattingElement in self.tree.openElements and
                 not self.tree.elementInScope(formattingElement.name))):
                self.parser.parseError("adoption-agency-1.1", {"name": name})
                return

            # Step 1 paragraph 2
            elif formattingElement not in self.tree.openElements:
                self.parser.parseError("adoption-agency-1.2", {"name": name})
                self.tree.activeFormattingElements.remove(formattingElement)
                return

            # Step 1 paragraph 3
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": name})

            # Step 2
            # Start of the adoption agency algorithm proper
            afeIndex = self.tree.openElements.index(formattingElement)
            furthestBlock = None
            for element in self.tree.openElements[afeIndex:]:
                if element.nameTuple in specialElements:
                    furthestBlock = element
                    break
            # Step 3
            if furthestBlock is None:
                # No special element after the formatting element: simply
                # pop everything up to and including it.
                element = self.tree.openElements.pop()
                while element != formattingElement:
                    element = self.tree.openElements.pop()
                self.tree.activeFormattingElements.remove(element)
                return
            commonAncestor = self.tree.openElements[afeIndex - 1]

            # Step 5
            #if furthestBlock.parent:
            #    furthestBlock.parent.removeChild(furthestBlock)

            # Step 5
            # The bookmark is supposed to help us identify where to reinsert
            # nodes in step 12. We have to ensure that we reinsert nodes after
            # the node before the active formatting element. Note the bookmark
            # can move in step 7.4
            bookmark = self.tree.activeFormattingElements.index(
                formattingElement)

            # Step 6
            lastNode = node = furthestBlock
            innerLoopCounter = 0

            index = self.tree.openElements.index(node)
            while innerLoopCounter < 3:
                innerLoopCounter += 1
                # Node is element before node in open elements
                index -= 1
                node = self.tree.openElements[index]
                if node not in self.tree.activeFormattingElements:
                    self.tree.openElements.remove(node)
                    continue
                # Step 6.3
                if node == formattingElement:
                    break
                # Step 6.4
                if lastNode == furthestBlock:
                    bookmark = (self.tree.activeFormattingElements.index(node)
                                + 1)
                # Step 6.5
                #cite = node.parent
                clone = node.cloneNode()
                # Replace node with clone
                self.tree.activeFormattingElements[
                    self.tree.activeFormattingElements.index(node)] = clone
                self.tree.openElements[
                    self.tree.openElements.index(node)] = clone
                node = clone

                # Step 6.6
                # Remove lastNode from its parents, if any
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)
                node.appendChild(lastNode)
                # Step 6.7
                lastNode = node
                # End of inner loop

            # Step 7
            # Foster parent lastNode if commonAncestor is a
            # table, tbody, tfoot, thead, or tr we need to foster parent the
            # lastNode
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)

            if commonAncestor.name in frozenset(("table", "tbody", "tfoot",
                                                 "thead", "tr")):
                parent, insertBefore = \
                    self.tree.getTableMisnestedNodePosition()
                parent.insertBefore(lastNode, insertBefore)
            else:
                commonAncestor.appendChild(lastNode)

            # Step 8
            clone = formattingElement.cloneNode()

            # Step 9
            furthestBlock.reparentChildren(clone)

            # Step 10
            furthestBlock.appendChild(clone)

            # Step 11
            self.tree.activeFormattingElements.remove(formattingElement)
            self.tree.activeFormattingElements.insert(bookmark, clone)

            # Step 12
            self.tree.openElements.remove(formattingElement)
            self.tree.openElements.insert(
                self.tree.openElements.index(furthestBlock) + 1, clone)

    def endTagAppletMarqueeObject(self, token):
        """Handle </applet>, </marquee> and </object>: pop up to the
        element and clear the active formatting elements."""
        if self.tree.elementInScope(token["name"]):
            self.tree.generateImpliedEndTags()
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early",
                                   {"name": token["name"]})

        if self.tree.elementInScope(token["name"]):
            element = self.tree.openElements.pop()
            while element.name != token["name"]:
                element = self.tree.openElements.pop()
            self.tree.clearActiveFormattingElements()

    def endTagBr(self, token):
        """Treat </br> as a <br> start tag (with a parse error)."""
        self.parser.parseError("unexpected-end-tag-treated-as",
                               {"originalName": "br",
                                "newName": "br element"})
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(impliedTagToken("br", "StartTag"))
        self.tree.openElements.pop()

    def endTagOther(self, token):
        """Default end-tag handler: close the nearest open element with the
        token's name, unless a special element is found first (parse
        error, token ignored)."""
        for node in self.tree.openElements[::-1]:
            if node.name == token["name"]:
                self.tree.generateImpliedEndTags(exclude=token["name"])
                if self.tree.openElements[-1].name != token["name"]:
                    self.parser.parseError("unexpected-end-tag",
                                           {"name": token["name"]})
                while self.tree.openElements.pop() != node:
                    pass
                break
            else:
                if node.nameTuple in specialElements:
                    self.parser.parseError("unexpected-end-tag",
                                           {"name": token["name"]})
                    break
|
|
|
|
class TextPhase(Phase):
    """Phase used while the contents of an RCDATA/RAWTEXT element are read.

    Only character data, EOF and end tags are expected here; every exit
    path pops the current element and restores ``parser.originalPhase``.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([])
        self.startTagHandler.default = self.startTagOther
        self.endTagHandler = utils.MethodDispatcher([
            ("script", self.endTagScript)])
        self.endTagHandler.default = self.endTagOther

    def processCharacters(self, token):
        """Append the token's text to the tree."""
        self.tree.insertText(token["data"])

    def processEOF(self):
        """Handle EOF inside the element: parse error, pop the element,
        restore the original phase and return True."""
        # The error data must be a mapping, like every other parseError
        # call in this file; the original passed the bare element name.
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": self.tree.openElements[-1].name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode"%token['name']

    def endTagScript(self, token):
        """Handle </script>: pop the script element and restore the
        original phase."""
        node = self.tree.openElements.pop()
        assert node.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Handle any other end tag: pop the current element and restore
        the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
|
|
|
|
class InTablePhase(Phase):
    """Phase implementing the "in table" insertion mode."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("caption", self.startTagCaption),
            ("colgroup", self.startTagColgroup),
            ("col", self.startTagCol),
            (("tbody", "tfoot", "thead"), self.startTagRowGroup),
            (("td", "th", "tr"), self.startTagImplyTbody),
            ("table", self.startTagTable),
            (("style", "script"), self.startTagStyleScript),
            ("input", self.startTagInput),
            ("form", self.startTagForm)
            ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), self.endTagIgnore)
            ])
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is <table> or <html>."""
        # "clear the stack back to a table context"
        while self.tree.openElements[-1].name not in ("table", "html"):
            #self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name":  self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        # When the current node is <html> it's an innerHTML case

    def _switchToTableText(self):
        """Make "inTableText" the current phase, remembering the phase that
        was current before, and return the new phase."""
        originalPhase = self.parser.phase
        tableText = self.parser.phases["inTableText"]
        self.parser.phase = tableText
        tableText.originalPhase = originalPhase
        return tableText

    def _processViaInBody(self, methodName, token):
        """Run the named "inBody" phase method on ``token`` with
        ``tree.insertFromTable`` set for the duration of the call.

        The flag is reset in a ``finally`` clause so that an exception in
        the in-body handler cannot leave it switched on.
        """
        self.tree.insertFromTable = True
        try:
            getattr(self.parser.phases["inBody"], methodName)(token)
        finally:
            self.tree.insertFromTable = False

    # processing methods
    def processEOF(self):
        """EOF inside a table is a parse error unless the current node is
        <html> (innerHTML case)."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-table")
        else:
            assert self.parser.innerHTML
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Hand whitespace over to the "inTableText" phase."""
        self._switchToTableText().processSpaceCharacters(token)

    def processCharacters(self, token):
        """Hand character data over to the "inTableText" phase."""
        self._switchToTableText().processCharacters(token)

    def insertText(self, token):
        """Insert text through the "inBody" phase with insertFromTable
        set."""
        # If we get here there must be at least one non-whitespace character
        # Do the table magic!
        self._processViaInBody("processCharacters", token)

    def startTagCaption(self, token):
        """Insert <caption> (after a Marker) and switch to "inCaption"."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert <colgroup> and switch to "inColumnGroup"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Imply a <colgroup> for a bare <col>, then return the token."""
        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
        return token

    def startTagRowGroup(self, token):
        """Insert tbody/tfoot/thead and switch to "inTableBody"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Imply a <tbody> for a bare td/th/tr, then return the token."""
        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
        return token

    def startTagTable(self, token):
        """A nested <table> start tag implies </table>; the token is
        returned unless parsing in innerHTML mode."""
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "table", "endName": "table"})
        self.parser.phase.processEndTag(impliedTagToken("table"))
        if not self.parser.innerHTML:
            return token

    def startTagStyleScript(self, token):
        """Delegate <style>/<script> to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagInput(self, token):
        """<input type=hidden> is inserted in place (parse error); any other
        input goes through ``startTagOther``."""
        if ("type" in token["data"] and
            token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            self.parser.parseError("unexpected-hidden-input-in-table")
            self.tree.insertElement(token)
            # XXX associate with form
            self.tree.openElements.pop()
        else:
            self.startTagOther(token)

    def startTagForm(self, token):
        """<form> in a table: parse error; if no form pointer is set the
        element is inserted, recorded as form pointer and popped again."""
        self.parser.parseError("unexpected-form-in-table")
        if self.tree.formPointer is None:
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]
            self.tree.openElements.pop()

    def startTagOther(self, token):
        """Default start-tag handler: parse error, then process the token
        through the "inBody" phase with insertFromTable set."""
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo",
                               {"name": token["name"]})
        # Do the table magic!
        self._processViaInBody("processStartTag", token)

    def endTagTable(self, token):
        """Handle </table>: pop up to and including the table and reset the
        insertion mode."""
        if self.tree.elementInScope("table", variant="table"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "table":
                self.parser.parseError("end-tag-too-early-named",
                    {"gotName": "table",
                     "expectedName": self.tree.openElements[-1].name})
            while self.tree.openElements[-1].name != "table":
                self.tree.openElements.pop()
            self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """End tags that are ignored here apart from a parse error."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Default end-tag handler: parse error, then process the token
        through the "inBody" phase with insertFromTable set."""
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo",
                               {"name": token["name"]})
        # Do the table magic!
        self._processViaInBody("processEndTag", token)
|
|
|
|
class InTableTextPhase(Phase):
    """Phase that buffers character tokens seen while in a table.

    Tokens are collected in ``characterTokens``; any non-character token
    (or EOF) flushes the buffer and restores ``originalPhase``.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.originalPhase = None
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character is sent through the
        "inTable" phase's ``insertText``; non-empty whitespace-only text is
        inserted into the tree directly.
        """
        pieces = [pending["data"] for pending in self.characterTokens]
        text = "".join(pieces)
        hasNonSpace = any([ch not in spaceCharacters for ch in text])
        if hasNonSpace:
            merged = {"type": tokenTypes["Characters"], "data": text}
            self.parser.phases["inTable"].insertText(merged)
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def _flushAndRestore(self):
        """Flush buffered text, then make the original phase current."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def processComment(self, token):
        """Leave this phase and return the comment token unchanged."""
        self._flushAndRestore()
        return token

    def processEOF(self):
        """Leave this phase and return True."""
        self._flushAndRestore()
        return True

    def processCharacters(self, token):
        """Buffer a character token; a lone U+0000 token is dropped."""
        if token["data"] != u"\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Leave this phase and return the start tag token unchanged."""
        self._flushAndRestore()
        return token

    def processEndTag(self, token):
        """Leave this phase and return the end tag token unchanged."""
        self._flushAndRestore()
        return token
|
|
|
|
|
|
class InCaptionPhase(Phase):
    """Token handlers for the "in caption" phase.

    Table-structure tags first close the caption; other tokens are handed
    to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        tableStartTags = ("caption", "col", "colgroup", "tbody", "td",
                          "tfoot", "th", "thead", "tr")
        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (tableStartTags, self.startTagTableElement)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        ignoredEndTags = ("body", "col", "colgroup", "html", "tbody", "td",
                          "tfoot", "th", "thead", "tr")
        onEnd = utils.MethodDispatcher([
            ("caption", self.endTagCaption),
            ("table", self.endTagTable),
            (ignoredEndTags, self.endTagIgnore)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def ignoreEndTagCaption(self):
        """Return True when no caption is in table scope."""
        return not self.tree.elementInScope("caption", variant="table")

    def _closeCaptionThenReprocess(self, token):
        """Report an error, imply </caption>, and return token unless ignored."""
        self.parser.parseError()
        #XXX Have to duplicate logic here to find out if the tag is ignored
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if not ignored:
            return token

    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processCharacters(token)

    def startTagTableElement(self, token):
        return self._closeCaptionThenReprocess(token)

    def startTagOther(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption and switch to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        current_name = tree.openElements[-1].name
        if current_name != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": current_name})
        while tree.openElements[-1].name != "caption":
            tree.openElements.pop()
        tree.openElements.pop()
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeCaptionThenReprocess(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagOther(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processEndTag(token)
|
|
|
|
|
|
class InColumnGroupPhase(Phase):
    """Token handlers for the "in column group" phase.

    Anything other than <html>, <col>, </colgroup> and </col> implies
    </colgroup> and is then returned for reprocessing, unless the implied
    end tag had to be ignored.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("col", self.startTagCol)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            ("colgroup", self.endTagColgroup),
            ("col", self.endTagCol)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the html element."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def _impliedColgroupClosed(self):
        """Imply </colgroup>; return True when it was not ignored."""
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return not ignored

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return
        if self._impliedColgroupClosed():
            return True

    def processCharacters(self, token):
        if self._impliedColgroupClosed():
            return token

    def startTagCol(self, token):
        # <col> is inserted and immediately popped again.
        self.tree.insertElement(token)
        self.tree.openElements.pop()

    def startTagOther(self, token):
        if self._impliedColgroupClosed():
            return token

    def endTagColgroup(self, token):
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        if self._impliedColgroupClosed():
            return token
|
|
|
|
|
|
class InTableBodyPhase(Phase):
    """Token handlers for the "in table body" phase (tbody/thead/tfoot).

    Tokens not handled here are delegated to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             self.startTagTableOther)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or html is the current node."""
        stop_names = ("tbody", "tfoot", "thead", "html")
        while self.tree.openElements[-1].name not in stop_names:
            #self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    def _rowGroupInTableScope(self):
        """Return True if a tbody, thead or tfoot is in table scope."""
        for group in ("tbody", "thead", "tfoot"):
            if self.tree.elementInScope(group, variant="table"):
                return True
        return False

    def _closeRowGroupThenReprocess(self, token):
        """Close the open row group and return token for reprocessing."""
        # XXX AT Any ideas on how to share this with endTagTable?
        if not self._rowGroupInTableScope():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableBodyContext()
        current_name = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(current_name))
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processCharacters(token)

    def startTagTr(self, token):
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """A cell outside a row: imply <tr>, then reprocess the cell."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", details)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        return self._closeRowGroupThenReprocess(token)

    def startTagOther(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        tag_name = token["name"]
        if not self.tree.elementInScope(tag_name, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": tag_name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeRowGroupThenReprocess(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", details)

    def endTagOther(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processEndTag(token)
|
|
|
|
|
|
class InRowPhase(Phase):
    """Token handlers for the "in row" phase (inside a <tr>).

    Tokens not handled here are delegated to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
             self.endTagIgnore)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until tr or html is current, one error per pop."""
        open_elements = self.tree.openElements
        while open_elements[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": open_elements[-1].name})
            open_elements.pop()

    def ignoreEndTagTr(self):
        """Return True when no tr is in table scope."""
        return not self.tree.elementInScope("tr", variant="table")

    def _closeRowThenReprocess(self, token):
        """Imply </tr>; return token unless the implied tag was ignored."""
        ignored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if not ignored:
            return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processCharacters(token)

    def startTagTableCell(self, token):
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        return self._closeRowThenReprocess(token)

    def startTagOther(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processStartTag(token)

    def endTagTr(self, token):
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        # Reprocess the current tag if the tr end tag was not ignored
        return self._closeRowThenReprocess(token)

    def endTagTableRowGroup(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", details)

    def endTagOther(self, token):
        table_phase = self.parser.phases["inTable"]
        return table_phase.processEndTag(token)
|
|
|
|
class InCellPhase(Phase):
    """Token handlers for the "in cell" phase (inside a <td> or <th>).

    Tokens not handled here are delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    # helper
    def closeCell(self):
        """Imply the end tag of whichever cell kind is in table scope."""
        in_scope = self.tree.elementInScope
        if in_scope("td", variant="table"):
            self.endTagTableCell(impliedTagToken("td"))
        elif in_scope("th", variant="table"):
            self.endTagTableCell(impliedTagToken("th"))

    # the rest
    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processCharacters(token)

    def startTagTableOther(self, token):
        cell_open = (self.tree.elementInScope("td", variant="table") or
                     self.tree.elementInScope("th", variant="table"))
        if not cell_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.closeCell()
        return token

    def startTagOther(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch to the "inRow" phase."""
        cell_name = token["name"]
        tree = self.tree
        if not tree.elementInScope(cell_name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": cell_name})
            return

        tree.generateImpliedEndTags(cell_name)
        if tree.openElements[-1].name == cell_name:
            tree.openElements.pop()
        else:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cell_name})
            # Pop up to and including the cell element.
            while tree.openElements.pop().name != cell_name:
                pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagImply(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return
        self.closeCell()
        return token

    def endTagOther(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processEndTag(token)
|
|
|
|
class InSelectPhase(Phase):
    """Token handlers for the "in select" phase.

    Unrecognised start and end tags are reported as parse errors and
    otherwise dropped.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect),
            (("input", "keygen", "textarea"), self.startTagInput),
            ("script", self.startTagScript)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        text = token["data"]
        # A lone NUL character is dropped.
        if text == u"\u0000":
            return
        self.tree.insertText(text)

    def startTagOption(self, token):
        # We need to imply </option> if <option> is the current node.
        open_elements = self.tree.openElements
        if open_elements[-1].name == "option":
            open_elements.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        # An open option, and then an open optgroup, are implicitly closed.
        open_elements = self.tree.openElements
        if open_elements[-1].name == "option":
            open_elements.pop()
        if open_elements[-1].name == "optgroup":
            open_elements.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        # A nested <select> acts like </select>.
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", details)

    def endTagOption(self, token):
        open_elements = self.tree.openElements
        if open_elements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        open_elements.pop()

    def endTagOptgroup(self, token):
        open_elements = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if (open_elements[-1].name == "option" and
                open_elements[-2].name == "optgroup"):
            open_elements.pop()
        # It also closes </optgroup>
        if open_elements[-1].name == "optgroup":
            open_elements.pop()
            return
        # But nothing else
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": "optgroup"})

    def endTagSelect(self, token):
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # Pop up to and including the select element.
        while self.tree.openElements.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", details)
|
|
|
|
|
|
class InSelectInTablePhase(Phase):
    """Token handlers for a <select> that sits inside table content.

    Table-structure tags imply </select>; everything else is delegated to
    the "inSelect" phase.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        tableTags = ("caption", "table", "tbody", "tfoot", "thead", "tr",
                     "td", "th")

        onStart = utils.MethodDispatcher([
            (tableTags, self.startTagTable)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            (tableTags, self.endTagTable)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def processEOF(self):
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processCharacters(token)

    def startTagTable(self, token):
        """Imply </select>, then return the token for reprocessing."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", details)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processStartTag(token)

    def endTagTable(self, token):
        """Imply </select> and reprocess, if the named element is in table scope."""
        tag_name = token["name"]
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": tag_name})
        if not self.tree.elementInScope(tag_name, variant="table"):
            return
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processEndTag(token)
|
|
|
|
|
|
class InForeignContentPhase(Phase):
    """Token handlers used while the current node is in a foreign namespace.

    Start tags named in ``breakoutElements`` (and <font> carrying a color,
    face or size attribute) pop the foreign elements and are returned for
    reprocessing. Other start tags are inserted in the current node's
    namespace after tag-name and attribute adjustment.
    """

    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-cased SVG tag name -> canonical mixed-case spelling. Kept at
    # class level so the table is built once, not on every start tag.
    svgTagNameReplacements = {u"altglyph":u"altGlyph",
                              u"altglyphdef":u"altGlyphDef",
                              u"altglyphitem":u"altGlyphItem",
                              u"animatecolor":u"animateColor",
                              u"animatemotion":u"animateMotion",
                              u"animatetransform":u"animateTransform",
                              u"clippath":u"clipPath",
                              u"feblend":u"feBlend",
                              u"fecolormatrix":u"feColorMatrix",
                              u"fecomponenttransfer":u"feComponentTransfer",
                              u"fecomposite":u"feComposite",
                              u"feconvolvematrix":u"feConvolveMatrix",
                              u"fediffuselighting":u"feDiffuseLighting",
                              u"fedisplacementmap":u"feDisplacementMap",
                              u"fedistantlight":u"feDistantLight",
                              u"feflood":u"feFlood",
                              u"fefunca":u"feFuncA",
                              u"fefuncb":u"feFuncB",
                              u"fefuncg":u"feFuncG",
                              u"fefuncr":u"feFuncR",
                              u"fegaussianblur":u"feGaussianBlur",
                              u"feimage":u"feImage",
                              u"femerge":u"feMerge",
                              u"femergenode":u"feMergeNode",
                              u"femorphology":u"feMorphology",
                              u"feoffset":u"feOffset",
                              u"fepointlight":u"fePointLight",
                              u"fespecularlighting":u"feSpecularLighting",
                              u"fespotlight":u"feSpotLight",
                              u"fetile":u"feTile",
                              u"feturbulence":u"feTurbulence",
                              u"foreignobject":u"foreignObject",
                              u"glyphref":u"glyphRef",
                              u"lineargradient":u"linearGradient",
                              u"radialgradient":u"radialGradient",
                              u"textpath":u"textPath"}

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

    def adjustSVGTagNames(self, token):
        """Rewrite token["name"] in place to its mixed-case SVG spelling."""
        replacements = self.svgTagNameReplacements
        if token["name"] in replacements:
            token["name"] = replacements[token["name"]]

    def processCharacters(self, token):
        """Replace a lone NUL with U+FFFD, update framesetOK, then delegate."""
        if token["data"] == u"\u0000":
            token["data"] = u"\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            # Non-space text means a frameset is no longer acceptable.
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out and return the token.

        Returns the token when it has to be reprocessed by another phase,
        otherwise None.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & set(["color", "face", "size"]))):
            # datavars must be a mapping, as in every other parseError call
            # here, so "%(name)s" style messages can be formatted.
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        Walks down the open-element stack. A node whose ASCII-lower-cased
        name equals the token name is popped together with everything
        above it. On reaching a node in the default namespace the token
        is handed to the parser's current phase instead.

        Returns None after closing, otherwise that phase's result.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        # Fold case the same way the loop below does; otherwise a correct
        # end tag for a mixed-case element such as foreignObject is
        # reported as unexpected.
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag",
                                   {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                #XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
|
|
|
|
|
|
class AfterBodyPhase(Phase):
    """Token handlers for the phase entered after the body has been closed.

    Character data and unexpected tags switch the parser back to the
    "inBody" phase and are returned for reprocessing.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([("html", self.endTagHtml)])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def _backToBody(self, token):
        """Switch to the "inBody" phase and return token for reprocessing."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        #Stop parsing
        pass

    def processComment(self, token):
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        html_element = self.tree.openElements[0]
        self.tree.insertComment(token, html_element)

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-body")
        return self._backToBody(token)

    def startTagHtml(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-body", details)
        return self._backToBody(token)

    def endTagHtml(self,name):
        # NOTE: despite its name, the argument is the end-tag token.
        if not self.parser.innerHTML:
            self.parser.phase = self.parser.phases["afterAfterBody"]
        else:
            self.parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-body", details)
        return self._backToBody(token)
|
|
|
|
class InFramesetPhase(Phase):
    """Token handlers for the "in frameset" phase.

    Character data and unrecognised tags are reported as parse errors
    and dropped.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            ("frameset", self.endTagFrameset)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        # <frame> is inserted and immediately popped again.
        self.tree.insertElement(token)
        self.tree.openElements.pop()

    def startTagNoframes(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        open_elements = self.tree.openElements
        if open_elements[-1].name != "html":
            open_elements.pop()
        else:
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        if self.parser.innerHTML:
            return
        if open_elements[-1].name != "frameset":
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)
|
|
|
|
|
|
class AfterFramesetPhase(Phase):
    """Token handlers for the phase entered after the frameset is closed.

    Only <html>, <noframes> and </html> are acted on; everything else is
    reported as a parse error and dropped.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

        onEnd = utils.MethodDispatcher([
            ("html", self.endTagHtml)
        ])
        onEnd.default = self.endTagOther
        self.endTagHandler = onEnd

    def processEOF(self):
        #Stop parsing
        pass

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        self.parser.phase = self.parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)
|
|
|
|
|
|
class AfterAfterBodyPhase(Phase):
    """Token handlers for the phase entered after </html> following the body.

    Comments are attached to the document. Non-space characters and
    unexpected tags switch back to the "inBody" phase and are returned
    for reprocessing.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

    def _backToBody(self, token):
        """Switch to the "inBody" phase and return token for reprocessing."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")
        return self._backToBody(token)

    def startTagHtml(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)
        return self._backToBody(token)

    def processEndTag(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)
        return self._backToBody(token)
|
|
|
|
class AfterAfterFramesetPhase(Phase):
    """Token handlers for the phase entered after </html> following a frameset.

    Comments are attached to the document. Non-space characters and
    unexpected tags are reported as parse errors and dropped.
    """

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        onStart = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoFrames)
        ])
        onStart.default = self.startTagOther
        self.startTagHandler = onStart

    def processEOF(self):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagNoFrames(self, token):
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)
|
|
|
|
|
|
return {
|
|
"initial": InitialPhase,
|
|
"beforeHtml": BeforeHtmlPhase,
|
|
"beforeHead": BeforeHeadPhase,
|
|
"inHead": InHeadPhase,
|
|
# XXX "inHeadNoscript": InHeadNoScriptPhase,
|
|
"afterHead": AfterHeadPhase,
|
|
"inBody": InBodyPhase,
|
|
"text": TextPhase,
|
|
"inTable": InTablePhase,
|
|
"inTableText": InTableTextPhase,
|
|
"inCaption": InCaptionPhase,
|
|
"inColumnGroup": InColumnGroupPhase,
|
|
"inTableBody": InTableBodyPhase,
|
|
"inRow": InRowPhase,
|
|
"inCell": InCellPhase,
|
|
"inSelect": InSelectPhase,
|
|
"inSelectInTable": InSelectInTablePhase,
|
|
"inForeignContent": InForeignContentPhase,
|
|
"afterBody": AfterBodyPhase,
|
|
"inFrameset": InFramesetPhase,
|
|
"afterFrameset": AfterFramesetPhase,
|
|
"afterAfterBody": AfterAfterBodyPhase,
|
|
"afterAfterFrameset": AfterAfterFramesetPhase,
|
|
# XXX after after frameset
|
|
}
|
|
|
|
def impliedTagToken(name, type="EndTag", attributes = None,
                    selfClosing = False):
    """Build a tag token dict for a tag implied by the parser.

    name        -- tag name; converted with unicode()
    type        -- key into tokenTypes ("EndTag" by default)
    attributes  -- attribute mapping stored under "data"; a fresh empty
                   dict is used when None so calls never share one
    selfClosing -- stored under "selfClosing"

    Returns a dict with the keys "type", "name", "data" and "selfClosing".
    """
    data = attributes
    if data is None:
        data = {}
    implied = {}
    implied["type"] = tokenTypes[type]
    implied["name"] = unicode(name)
    implied["data"] = data
    implied["selfClosing"] = selfClosing
    return implied
|
|
|
|
class ParseError(Exception):
    """Exception type representing an error in the parsed document."""
|