mirror of
https://github.com/moparisthebest/SickRage
synced 2024-12-14 20:12:19 -05:00
1745 lines
74 KiB
Python
1745 lines
74 KiB
Python
|
try:
|
||
|
frozenset
|
||
|
except NameError:
|
||
|
# Import from the sets module for python 2.3
|
||
|
from sets import Set as set
|
||
|
from sets import ImmutableSet as frozenset
|
||
|
try:
|
||
|
from collections import deque
|
||
|
except ImportError:
|
||
|
from utils import deque
|
||
|
|
||
|
from constants import spaceCharacters
|
||
|
from constants import entitiesWindows1252, entities
|
||
|
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
|
||
|
from constants import digits, hexDigits, EOF
|
||
|
from constants import tokenTypes, tagTokenTypes
|
||
|
from constants import replacementCharacters
|
||
|
|
||
|
from inputstream import HTMLInputStream
|
||
|
|
||
|
# Index the entity names by their first character so that consumeEntity only
# has to scan the names that can possibly match.
entitiesByFirstChar = {}
for e in entities:
    if e[0] not in entitiesByFirstChar:
        entitiesByFirstChar[e[0]] = []
    entitiesByFirstChar[e[0]].append(e)
|
||
|
|
||
|
class HTMLTokenizer(object):
|
||
|
""" This class takes care of tokenizing HTML.
|
||
|
|
||
|
* self.currentToken
|
||
|
Holds the token that is currently being processed.
|
||
|
|
||
|
* self.state
|
||
|
Holds a reference to the method to be invoked... XXX
|
||
|
|
||
|
* self.stream
|
||
|
Points to HTMLInputStream object.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
             lowercaseElementName=True, lowercaseAttrName=True, parser=None):
    """Wrap *stream* in an HTMLInputStream and reset the tokenizer.

    stream, encoding, parseMeta and useChardet are handed straight to
    HTMLInputStream.  lowercaseElementName and lowercaseAttrName are kept
    as flags on the instance, and parser is stored as given.
    """
    self.parser = parser
    self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)

    # Case-conversion switches.
    self.lowercaseAttrName = lowercaseAttrName
    self.lowercaseElementName = lowercaseElementName

    # Initial tokenizer state: begin in the data state.
    self.escapeFlag = False
    self.lastFourChars = []
    self.state = self.dataState
    self.escape = False

    # No token is under construction yet.
    self.currentToken = None
    super(HTMLTokenizer, self).__init__()
|
||
|
|
||
|
def __iter__(self):
    """Yield tokens one at a time until the current state reports EOF.

    Each pass runs the current state method once.  Afterwards every error
    collected on the stream is yielded as a ParseError token, followed by
    whatever the state method left on self.tokenQueue.  Being a generator,
    processing pauses at each yield until the next token is requested.
    """
    self.tokenQueue = deque([])
    # A state method returns False instead of True once EOF has been
    # reached, which ends this loop.
    while self.state():
        while self.stream.errors:
            message = self.stream.errors.pop(0)
            yield {"type": tokenTypes["ParseError"], "data": message}
        while self.tokenQueue:
            yield self.tokenQueue.popleft()
|
||
|
|
||
|
def consumeNumberEntity(self, isHex):
    """Consume the digits of a numeric character reference.

    Digits are read from the stream in base 16 when isHex is true and in
    base 10 otherwise, and the resulting character is returned.  Code
    points found in replacementCharacters are mapped through that table;
    surrogates and values above 0x10FFFF become U+FFFD.  Those cases, and
    the other disallowed ranges checked below, queue an
    "illegal-codepoint-for-numeric-entity" ParseError.  A trailing ";" is
    discarded; any other character is pushed back onto the stream and a
    "numeric-entity-without-semicolon" ParseError is queued.
    """
    if isHex:
        allowed, radix = hexDigits, 16
    else:
        allowed, radix = digits, 10

    def reportIllegalCodepoint():
        # All rejected or suspicious code points are reported the same way.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": charAsInt}})

    # Keep reading while the characters are valid digits for this radix;
    # EOF also ends the run.
    consumed = []
    c = self.stream.char()
    while c in allowed and c is not EOF:
        consumed.append(c)
        c = self.stream.char()

    # Turn the collected digits into the code point they spell.
    charAsInt = int("".join(consumed), radix)

    if charAsInt in replacementCharacters:
        # Certain code points are swapped for others.
        char = replacementCharacters[charAsInt]
        reportIllegalCodepoint()
    elif 0xD800 <= charAsInt <= 0xDFFF or charAsInt > 0x10FFFF:
        char = u"\uFFFD"
        reportIllegalCodepoint()
    else:
        # Here charAsInt <= 0x10FFFF, so the bit mask picks out exactly
        # the xFFFE / xFFFF pair at the end of each of the 17 planes.
        disallowed = (0x0001 <= charAsInt <= 0x0008 or
                      0x000E <= charAsInt <= 0x001F or
                      0x007F <= charAsInt <= 0x009F or
                      0xFDD0 <= charAsInt <= 0xFDEF or
                      charAsInt == 0x000B or
                      (charAsInt & 0xFFFE) == 0xFFFE)
        if disallowed:
            reportIllegalCodepoint()
        try:
            # unichr on UCS-2 Python builds only works within the BMP.
            char = unichr(charAsInt)
        except ValueError:
            # NOTE: eval only ever sees a literal built from an integer
            # formatted with %08x, never raw stream text.
            char = eval("u'\\U%08x'" % charAsInt)

    # Swallow the ";" if there is one; otherwise report the omission and
    # give the character back to the stream.
    if c != u";":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "numeric-entity-without-semicolon"})
        self.stream.unget(c)

    return char
|
||
|
|
||
|
def consumeEntity(self, allowedChar=None, fromAttribute=False):
    """Consume a character reference that follows an already-read "&".

    The result is the referenced character(s), or the literal text that
    was read when no reference could be matched.  With fromAttribute set
    it is appended to the value of the last attribute of
    self.currentToken; otherwise it is queued as a SpaceCharacters or
    Characters token.  allowedChar, when given, is an extra character
    that stops the "&" from being treated as the start of a reference.
    """
    # Default result when nothing matches.
    output = u"&"

    consumed = [self.stream.char()]
    first = consumed[0]
    if (first in spaceCharacters or first in (EOF, u"<", u"&")
            or (allowedChar is not None and allowedChar == first)):
        # Not a reference at all: give the character back.
        self.stream.unget(first)

    elif first == u"#":
        # Numeric reference; an "x" or "X" selects hexadecimal.
        isHex = False
        consumed.append(self.stream.char())
        if consumed[-1] in (u"x", u"X"):
            isHex = True
            consumed.append(self.stream.char())

        # consumed[-1] should now be the first digit.
        if isHex:
            haveDigit = consumed[-1] in hexDigits
        else:
            haveDigit = consumed[-1] in digits

        if haveDigit:
            # At least one digit: let consumeNumberEntity re-read it and
            # take the whole number.
            self.stream.unget(consumed[-1])
            output = self.consumeNumberEntity(isHex)
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-numeric-entity"})
            self.stream.unget(consumed.pop())
            output = u"&" + u"".join(consumed)

    else:
        # Possibly a named entity.  Keep reading for as long as what has
        # been read so far is still a prefix of some known entity name.
        candidates = entitiesByFirstChar.get(first, [])
        while consumed[-1] is not EOF:
            prefix = "".join(consumed)
            if not [known for known in candidates
                    if known.startswith(prefix)]:
                break
            consumed.append(self.stream.char())

        # Pick the longest prefix that is an actual entity name; this
        # takes care of input such as "&noti".
        entityName = None
        for entityLength in xrange(len(consumed) - 1, 1, -1):
            possibleEntityName = "".join(consumed[:entityLength])
            if possibleEntityName in entities:
                entityName = possibleEntityName
                break

        if entityName is None:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-named-entity"})
            self.stream.unget(consumed.pop())
            output = u"&" + u"".join(consumed)
        else:
            missingSemicolon = entityName[-1] != ";"
            if missingSemicolon:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "named-entity-without-semicolon"})

            # Inside an attribute, an unterminated name followed by a
            # letter, digit or "=" is kept as literal text.
            keepLiteral = False
            if missingSemicolon and fromAttribute:
                following = consumed[entityLength]
                keepLiteral = (following in asciiLetters or
                               following in digits or
                               following == "=")

            if keepLiteral:
                self.stream.unget(consumed.pop())
                output = u"&" + u"".join(consumed)
            else:
                output = entities[entityName]
                self.stream.unget(consumed.pop())
                output += u"".join(consumed[entityLength:])

    if fromAttribute:
        self.currentToken["data"][-1][1] += output
        return

    if output in spaceCharacters:
        tokenType = "SpaceCharacters"
    else:
        tokenType = "Characters"
    self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
|
||
|
|
||
|
def processEntityInAttribute(self, allowedChar):
    """Consume an entity found inside an attribute value.

    Stands in for a separate "entityInAttributeValueState": it simply
    calls consumeEntity with fromAttribute=True and forwards allowedChar
    unchanged.
    """
    self.consumeEntity(fromAttribute=True, allowedChar=allowedChar)
|
||
|
|
||
|
def emitCurrentToken(self):
    """Queue self.currentToken and switch back to the data state.

    For tag tokens the name is lower-cased first when
    self.lowercaseElementName is set.  An end tag that carries attributes
    or has the self-closing flag set gets a ParseError queued ahead of it.
    """
    token = self.currentToken
    if token["type"] in tagTokenTypes:
        if self.lowercaseElementName:
            token["name"] = token["name"].translate(asciiUpper2Lower)
        if token["type"] == tokenTypes["EndTag"]:
            # Neither attributes nor a self-closing flag belong on an end tag.
            endTagChecks = (("data", "attributes-in-end-tag"),
                            ("selfClosing", "self-closing-flag-on-end-tag"))
            for key, message in endTagChecks:
                if token[key]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": message})
    # Hand the token over to be yielded.
    self.tokenQueue.append(token)
    self.state = self.dataState
|
||
|
|
||
|
|
||
|
# Below are the various tokenizer states worked out.
|
||
|
|
||
|
def dataState(self):
    """Handle one step of the data state.

    Returns False when the stream is exhausted and True otherwise.
    """
    data = self.stream.char()
    if data == "&":
        self.state = self.entityDataState
        return True
    if data == "<":
        self.state = self.tagOpenState
        return True
    if data == u"\u0000":
        # Reported as an error, but the NUL itself is still emitted.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data":"invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\u0000"})
        return True
    if data is EOF:
        # Tokenization ends.
        return False
    if data in spaceCharacters:
        # The tokenizer returns to this state right after emitting a
        # token, where whitespace matters, so a run of space characters
        # is emitted as a token of its own.
        spaces = data + self.stream.charsUntil(spaceCharacters, True)
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"],
                                "data": spaces})
        return True
    # Ordinary text: take everything up to the next special character.
    text = data + self.stream.charsUntil((u"&", u"<", u"\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||
|
|
||
|
def entityDataState(self):
    """Consume an entity met in the data state, then return to that state.

    Always returns True.
    """
    self.consumeEntity()
    # The entity has been dealt with; carry on with ordinary data.
    self.state = self.dataState
    return True
|
||
|
|
||
|
def rcdataState(self):
    """Handle one step of the RCDATA state.

    "&" starts a character reference and "<" may start an end tag.  A
    U+0000 is reported as a ParseError and replaced by U+FFFD.  A run of
    space characters is emitted as a SpaceCharacters token; any other
    text is emitted as a Characters token.

    Returns False when the stream is exhausted and True otherwise.
    """
    data = self.stream.char()
    if data == "&":
        self.state = self.characterReferenceInRcdata
    elif data == "<":
        self.state = self.rcdataLessThanSignState
    elif data is EOF:
        # Tokenization ends.
        return False
    elif data == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
    elif data in spaceCharacters:
        # A run of space characters is emitted as a token of its own,
        # separately from other text.
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                data + self.stream.charsUntil(spaceCharacters, True)})
    else:
        # Stop at U+0000 as well as "&" and "<".  Otherwise a NUL that
        # follows ordinary text would be swallowed into this Characters
        # token and never reach the branch above that reports it and
        # substitutes U+FFFD.
        chars = self.stream.charsUntil((u"&", u"<", u"\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
|
||
|
|
||
|
def characterReferenceInRcdata(self):
    """Consume an entity met in RCDATA, then return to the RCDATA state.

    Always returns True.
    """
    self.consumeEntity()
    # The entity has been dealt with; carry on with RCDATA content.
    self.state = self.rcdataState
    return True
|
||
|
|
||
|
def rawtextState(self):
    """Handle one step of the RAWTEXT state.

    Returns False when the stream is exhausted and True otherwise.
    """
    data = self.stream.char()
    if data == "<":
        self.state = self.rawtextLessThanSignState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        return True
    if data == EOF:
        # Tokenization ends.
        return False
    # Plain text: take everything up to the next "<" or NUL.
    text = data + self.stream.charsUntil((u"<", u"\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||
|
|
||
|
def scriptDataState(self):
    """Handle one step of the script data state.

    Returns False when the stream is exhausted and True otherwise.
    """
    data = self.stream.char()
    if data == "<":
        self.state = self.scriptDataLessThanSignState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        return True
    if data == EOF:
        # Tokenization ends.
        return False
    # Script text: take everything up to the next "<" or NUL.
    text = data + self.stream.charsUntil((u"<", u"\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||
|
|
||
|
def plaintextState(self):
    """Handle one step of the PLAINTEXT state.

    Returns False when the stream is exhausted and True otherwise.
    """
    data = self.stream.char()
    if data == EOF:
        # Tokenization ends.
        return False
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        return True
    # Everything up to the next NUL is plain character data.
    text = data + self.stream.charsUntil(u"\u0000")
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||
|
|
||
|
def tagOpenState(self):
    """Decide what a "<" read in the data state introduces.

    Always returns True.
    """
    data = self.stream.char()
    if data == u"!":
        self.state = self.markupDeclarationOpenState
        return True
    if data == u"/":
        self.state = self.closeTagOpenState
        return True
    if data in asciiLetters:
        # A letter begins a start tag whose name starts with it.
        self.currentToken = {"type": tokenTypes["StartTag"],
                             "name": data, "data": [],
                             "selfClosing": False,
                             "selfClosingAcknowledged": False}
        self.state = self.tagNameState
        return True
    if data == u">":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-right-bracket"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
        self.state = self.dataState
        return True
    if data == u"?":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-question-mark"})
        self.stream.unget(data)
        self.state = self.bogusCommentState
        return True
    # XXX Anything else: the "<" was just text; re-read data in the data
    # state.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-tag-name"})
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(data)
    self.state = self.dataState
    return True
|
||
|
|
||
|
def closeTagOpenState(self):
    """Decide what follows a "</" read in the data state.

    Always returns True.
    """
    data = self.stream.char()
    if data in asciiLetters:
        # A letter begins an end tag whose name starts with it.
        self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                             "data": [], "selfClosing":False}
        self.state = self.tagNameState
        return True
    if data == u">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-right-bracket"})
        self.state = self.dataState
        return True
    if data is EOF:
        # The "</" already read is emitted as text.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-eof"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
        self.state = self.dataState
        return True
    # XXX data can be _'_...
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-closing-tag-but-got-char",
                            "datavars": {"data": data}})
    self.stream.unget(data)
    self.state = self.bogusCommentState
    return True
|
||
|
|
||
|
def tagNameState(self):
    """Read one more character of the current tag's name.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if data == u">":
        self.emitCurrentToken()
        return True
    if data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-tag-name"})
        self.state = self.dataState
        return True
    if data == u"/":
        self.state = self.selfClosingStartTagState
        return True
    if data == u"\u0000":
        # NUL is reported and U+FFFD goes into the name instead.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] += u"\uFFFD"
        return True
    # (charsUntil is deliberately not used here: tag names are very
    # short and it is faster to not do anything fancy.)
    self.currentToken["name"] += data
    return True
|
||
|
|
||
|
def rcdataLessThanSignState(self):
    """Handle the character after a "<" read in the RCDATA state.

    Always returns True.
    """
    data = self.stream.char()
    if data == "/":
        # Possibly an end tag: start collecting its name.
        self.temporaryBuffer = ""
        self.state = self.rcdataEndTagOpenState
        return True
    # The "<" was just text; re-read data in the RCDATA state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(data)
    self.state = self.rcdataState
    return True
|
||
|
|
||
|
def rcdataEndTagOpenState(self):
    """Handle the character after a "</" read in the RCDATA state.

    Always returns True.
    """
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.rcdataEndTagNameState
        return True
    # The "</" was just text; re-read data in the RCDATA state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(data)
    self.state = self.rcdataState
    return True
|
||
|
|
||
|
def rcdataEndTagNameState(self):
    """Collect the name of a possible end tag inside RCDATA.

    The tag only counts when its name, ignoring case, equals the name of
    self.currentToken.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    data = self.stream.char()
    endsName = data in spaceCharacters or data == "/" or data == ">"
    if appropriate and endsName:
        # A genuine end tag: whitespace, "/" and ">" all finish its name.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing":False}
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        # Not an end tag after all: what was read so far is plain text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.rcdataState
    return True
|
||
|
|
||
|
def rawtextLessThanSignState(self):
    """Handle the character after a "<" read in the RAWTEXT state.

    Always returns True.
    """
    data = self.stream.char()
    if data == "/":
        # Possibly an end tag: start collecting its name.
        self.temporaryBuffer = ""
        self.state = self.rawtextEndTagOpenState
        return True
    # The "<" was just text; re-read data in the RAWTEXT state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(data)
    self.state = self.rawtextState
    return True
|
||
|
|
||
|
def rawtextEndTagOpenState(self):
    """Handle the character after a "</" read in the RAWTEXT state.

    Always returns True.
    """
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.rawtextEndTagNameState
        return True
    # The "</" was just text; re-read data in the RAWTEXT state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(data)
    self.state = self.rawtextState
    return True
|
||
|
|
||
|
def rawtextEndTagNameState(self):
    """Collect the name of a possible end tag inside RAWTEXT.

    The tag only counts when its name, ignoring case, equals the name of
    self.currentToken.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    data = self.stream.char()
    endsName = data in spaceCharacters or data == "/" or data == ">"
    if appropriate and endsName:
        # A genuine end tag: whitespace, "/" and ">" all finish its name.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing":False}
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        # Not an end tag after all: what was read so far is plain text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.rawtextState
    return True
|
||
|
|
||
|
def scriptDataLessThanSignState(self):
    """Handle the character after a "<" read in the script data state.

    Always returns True.
    """
    data = self.stream.char()
    if data == "/":
        # Possibly an end tag: start collecting its name.
        self.temporaryBuffer = ""
        self.state = self.scriptDataEndTagOpenState
        return True
    if data == "!":
        # "<!" is emitted as text and may begin an escaped section.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<!"})
        self.state = self.scriptDataEscapeStartState
        return True
    # The "<" was just text; re-read data in the script data state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(data)
    self.state = self.scriptDataState
    return True
|
||
|
|
||
|
def scriptDataEndTagOpenState(self):
    """Handle the character after a "</" read in the script data state.

    Always returns True.
    """
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.scriptDataEndTagNameState
        return True
    # The "</" was just text; re-read data in the script data state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(data)
    self.state = self.scriptDataState
    return True
|
||
|
|
||
|
def scriptDataEndTagNameState(self):
    """Collect the name of a possible end tag inside script data.

    The tag only counts when its name, ignoring case, equals the name of
    self.currentToken.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    data = self.stream.char()
    endsName = data in spaceCharacters or data == "/" or data == ">"
    if appropriate and endsName:
        # A genuine end tag: whitespace, "/" and ">" all finish its name.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing":False}
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        # Not an end tag after all: what was read so far is plain text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.scriptDataState
    return True
|
||
|
|
||
|
def scriptDataEscapeStartState(self):
    """Handle the character after a "<!" read in script data.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataEscapeStartDashState
        return True
    # No dash: re-read data in the script data state.
    self.stream.unget(data)
    self.state = self.scriptDataState
    return True
|
||
|
|
||
|
def scriptDataEscapeStartDashState(self):
    """Handle the character after a "<!-" read in script data.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        # Second dash: "<!--" is complete.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    # No second dash: re-read data in the script data state.
    self.stream.unget(data)
    self.state = self.scriptDataState
    return True
|
||
|
|
||
|
def scriptDataEscapedState(self):
    """Handle one step of the script data escaped state.

    Always returns True; at EOF it switches to the data state.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataEscapedDashState
        return True
    if data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        return True
    if data == EOF:
        self.state = self.dataState
        return True
    # Ordinary text: take everything up to the next special character.
    text = data + self.stream.charsUntil((u"<", u"-", u"\u0000"))
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||
|
|
||
|
def scriptDataEscapedDashState(self):
    """Handle the character after a single "-" in escaped script data.

    Always returns True; at EOF it switches to the data state.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    if data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if data == EOF:
        self.state = self.dataState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
    # Either way the dash run is over.
    self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataEscapedDashDashState(self):
    """Handle the character after "--" in escaped script data.

    Always returns True; at EOF it switches to the data state.
    """
    data = self.stream.char()
    if data == "-":
        # Further dashes are emitted without leaving this state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        return True
    if data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if data == ">":
        # "-->" ends the escaped section.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
        self.state = self.scriptDataState
        return True
    if data == EOF:
        self.state = self.dataState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
    # Either way the dash run is over.
    self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataEscapedLessThanSignState(self):
    """Handle the character after a "<" in escaped script data.

    Always returns True.
    """
    data = self.stream.char()
    if data == "/":
        # Possibly an end tag: start collecting its name.
        self.temporaryBuffer = ""
        self.state = self.scriptDataEscapedEndTagOpenState
        return True
    if data in asciiLetters:
        # Possibly a nested start tag: emit the text and remember the
        # letter as the start of its name.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data})
        self.temporaryBuffer = data
        self.state = self.scriptDataDoubleEscapeStartState
        return True
    # The "<" was just text; re-read data in the escaped state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(data)
    self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataEscapedEndTagOpenState(self):
    """Handle the character after a "</" in escaped script data.

    Always returns True.
    """
    data = self.stream.char()
    if data in asciiLetters:
        # The first letter of the candidate end tag name.
        self.temporaryBuffer = data
        self.state = self.scriptDataEscapedEndTagNameState
        return True
    # The "</" was just text; re-read data in the escaped state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(data)
    self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataEscapedEndTagNameState(self):
    """Collect the name of a possible end tag inside escaped script data.

    The tag only counts when its name, ignoring case, equals the name of
    self.currentToken.  Always returns True.
    """
    appropriate = (self.currentToken and
                   self.currentToken["name"].lower() == self.temporaryBuffer.lower())
    data = self.stream.char()
    endsName = data in spaceCharacters or data == "/" or data == ">"
    if appropriate and endsName:
        # A genuine end tag: whitespace, "/" and ">" all finish its name.
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing":False}
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.emitCurrentToken()
            self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        # Not an end tag after all: what was read so far is plain text.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapeStartState(self):
    """Collect a tag name that may open a double-escaped script section.

    Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters or data in ("/", ">"):
        # The name is complete; only "script" (any case) switches to the
        # double-escaped state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataDoubleEscapedState
        else:
            self.state = self.scriptDataEscapedState
        return True
    if data in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.temporaryBuffer += data
        return True
    # Anything else: re-read data in the escaped state.
    self.stream.unget(data)
    self.state = self.scriptDataEscapedState
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapedState(self):
    """Handle one step of the script data double escaped state.

    Always returns True; at EOF a ParseError is queued and the tokenizer
    switches to the data state.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataDoubleEscapedDashState
        return True
    if data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        return True
    if data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    # Any other character is emitted as it is.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapedDashState(self):
    """Handle the character after a single "-" in double-escaped script data.

    A second "-" switches to self.scriptDataDoubleEscapedDashDashState.
    Always returns True; at EOF a ParseError is queued and the tokenizer
    switches to the data state.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataDoubleEscapedDashDashState
        return True
    if data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    if data == u"\u0000":
        # NUL is reported and replaced by U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
    # Either way the dash run is over.
    self.state = self.scriptDataDoubleEscapedState
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapedDashDashState(self):
    """Tokenize one character in the script data double escaped dash dash
    state.

    This state is entered from scriptDataDoubleEscapedDashState after a
    second "-".  It must carry its own name: defining it under the dash
    state's name would replace that method and leave
    self.scriptDataDoubleEscapedDashDashState undefined.

    * "-" is emitted and the state is left unchanged.
    * "<" is emitted and switches to the less-than-sign state.
    * ">" is emitted and switches to scriptDataState.
    * U+0000 is a parse error; U+FFFD is emitted in its place and the
      tokenizer returns to scriptDataDoubleEscapedState.
    * EOF is a parse error and the tokenizer returns to the data state.
    * Anything else is emitted and returns to
      scriptDataDoubleEscapedState.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        # Further dashes do not change the state.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u">"})
        self.state = self.scriptDataState
    elif data == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapedLessThanSignState(self):
    """Handle the character after "<" in double-escaped script data.

    A "/" is emitted, self.temporaryBuffer is reset to an empty string and
    the tokenizer moves to scriptDataDoubleEscapeEndState.  Any other
    character is pushed back onto the stream and the tokenizer returns to
    scriptDataDoubleEscapedState.

    Always returns True.
    """
    char = self.stream.char()
    if char != "/":
        # Not an end-tag candidate: let the double escaped state reread it.
        self.stream.unget(char)
        self.state = self.scriptDataDoubleEscapedState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"})
    self.temporaryBuffer = ""
    self.state = self.scriptDataDoubleEscapeEndState
    return True
|
||
|
|
||
|
def scriptDataDoubleEscapeEndState(self):
    """Collect a tag name and decide whether double escaping ends.

    ASCII letters are emitted and appended to self.temporaryBuffer.  When
    a space character, "/" or ">" is read it is emitted, and the state
    becomes scriptDataEscapedState if the buffer spells "script"
    (compared in lower case), otherwise scriptDataDoubleEscapedState.
    Any other input is pushed back and the tokenizer returns to
    scriptDataDoubleEscapedState.

    Always returns True.
    """
    char = self.stream.char()
    if char in (spaceCharacters | frozenset(("/", ">"))):
        self.tokenQueue.append(
            {"type": tokenTypes["Characters"], "data": char})
        closesScript = self.temporaryBuffer.lower() == "script"
        if closesScript:
            self.state = self.scriptDataEscapedState
        else:
            self.state = self.scriptDataDoubleEscapedState
        return True
    if char in asciiLetters:
        self.tokenQueue.append(
            {"type": tokenTypes["Characters"], "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
|
||
|
|
||
|
def beforeAttributeNameState(self):
    """Handle the character that precedes an attribute name.

    Space characters are skipped.  ">" emits the current token and "/"
    switches to selfClosingStartTagState.  Every other non-EOF character
    starts a new [name, value] pair on self.currentToken["data"] and
    switches to attributeNameState; quotes, "=" and "<" are also reported
    as parse errors, and U+0000 is reported and stored as U+FFFD.  EOF is
    a parse error and returns to the data state.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        # Swallow the rest of the whitespace run in one call.
        self.stream.charsUntil(spaceCharacters, True)
    elif char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    elif char == u">":
        self.emitCurrentToken()
    elif char == u"/":
        self.state = self.selfClosingStartTagState
    elif char in (u"'", u'"', u"=", u"<"):
        emit({"type": tokenTypes["ParseError"],
              "data": "invalid-character-in-attribute-name"})
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"].append([u"\uFFFD", ""])
        self.state = self.attributeNameState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-attribute-name-but-got-eof"})
        self.state = self.dataState
    else:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    return True
|
||
|
|
||
|
def attributeNameState(self):
    """Accumulate the name of the attribute currently being built.

    Characters are appended to the name of the last [name, value] pair in
    self.currentToken["data"] until "=", ">", a space character, "/" or
    EOF ends the name.  When the name ends it is lower-cased if
    self.lowercaseAttrName is set, and a "duplicate-attribute" parse error
    is queued if an earlier attribute on the token has the same name.

    Always returns True.
    """
    char = self.stream.char()
    nameComplete = True
    closeTag = False
    emit = self.tokenQueue.append
    if char == u"=":
        self.state = self.beforeAttributeValueState
    elif char in asciiLetters:
        self.currentToken["data"][-1][0] += (
            char + self.stream.charsUntil(asciiLetters, True))
        nameComplete = False
    elif char == u">":
        # XXX Emitting right here would convert the attributes to a dict
        # before they are checked, and the duplicate check below would then
        # fail because data is a dict rather than a list.  So only remember
        # that the token has to be emitted.
        closeTag = True
    elif char in spaceCharacters:
        self.state = self.afterAttributeNameState
    elif char == u"/":
        self.state = self.selfClosingStartTagState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"][-1][0] += u"\uFFFD"
        nameComplete = False
    elif char in (u"'", u'"', u"<"):
        emit({"type": tokenTypes["ParseError"],
              "data": "invalid-character-in-attribute-name"})
        self.currentToken["data"][-1][0] += char
        nameComplete = False
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "eof-in-attribute-name"})
        self.state = self.dataState
    else:
        self.currentToken["data"][-1][0] += char
        nameComplete = False

    if not nameComplete:
        return True

    # Attributes are not dropped at this stage.  That happens when the
    # start tag token is emitted, so values can still be safely appended
    # to attributes, but the parse error should be reported in time.
    attrs = self.currentToken["data"]
    if self.lowercaseAttrName:
        attrs[-1][0] = attrs[-1][0].translate(asciiUpper2Lower)
    for name, value in attrs[:-1]:
        if attrs[-1][0] == name:
            emit({"type": tokenTypes["ParseError"],
                  "data": "duplicate-attribute"})
            break
    # XXX Fix for the XXX above: emit only after the duplicate check.
    if closeTag:
        self.emitCurrentToken()
    return True
|
||
|
|
||
|
def afterAttributeNameState(self):
    """Handle the character that follows a completed attribute name.

    Space characters are skipped, "=" switches to
    beforeAttributeValueState, ">" emits the current token and "/"
    switches to selfClosingStartTagState.  Every other non-EOF character
    starts a new [name, value] pair and switches to attributeNameState;
    quotes and "<" are also reported as parse errors, and U+0000 is
    reported and stored as U+FFFD.  EOF is a parse error and returns to
    the data state.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == u"=":
        self.state = self.beforeAttributeValueState
    elif char == u">":
        self.emitCurrentToken()
    elif char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    elif char == u"/":
        self.state = self.selfClosingStartTagState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"].append([u"\uFFFD", ""])
        self.state = self.attributeNameState
    elif char in (u"'", u'"', u"<"):
        emit({"type": tokenTypes["ParseError"],
              "data": "invalid-character-after-attribute-name"})
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-end-of-tag-but-got-eof"})
        self.state = self.dataState
    else:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
    return True
|
||
|
|
||
|
def beforeAttributeValueState(self):
    """Handle the character that follows "=" in an attribute.

    Space characters are skipped and a quote selects the matching quoted
    value state.  "&" is pushed back so the unquoted state can process
    it.  ">" is a parse error and emits the current token.  U+0000,
    "=", "<" and "`" are parse errors but still start an unquoted value
    (U+0000 is stored as U+FFFD).  EOF is a parse error and returns to
    the data state.  Anything else starts an unquoted value.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif char == u"\"":
        self.state = self.attributeValueDoubleQuotedState
    elif char == u"&":
        self.state = self.attributeValueUnQuotedState
        self.stream.unget(char)
    elif char == u"'":
        self.state = self.attributeValueSingleQuotedState
    elif char == u">":
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-attribute-value-but-got-right-bracket"})
        self.emitCurrentToken()
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += u"\uFFFD"
        self.state = self.attributeValueUnQuotedState
    elif char in (u"=", u"<", u"`"):
        emit({"type": tokenTypes["ParseError"],
              "data": "equals-in-unquoted-attribute-value"})
        self.currentToken["data"][-1][1] += char
        self.state = self.attributeValueUnQuotedState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-attribute-value-but-got-eof"})
        self.state = self.dataState
    else:
        self.currentToken["data"][-1][1] += char
        self.state = self.attributeValueUnQuotedState
    return True
|
||
|
|
||
|
def attributeValueDoubleQuotedState(self):
    """Accumulate a double-quoted attribute value.

    The closing quote switches to afterAttributeValueState and "&" is
    handed to self.processEntityInAttribute.  U+0000 is a parse error and
    is stored as U+FFFD.  EOF is a parse error and returns to the data
    state.  Any other character is appended to the value together with
    the following run of characters up to the next quote or "&".

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterAttributeValueState
        return True
    if char == u"&":
        self.processEntityInAttribute(u'"')
        return True
    if char == u"\u0000":
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += u"\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "eof-in-attribute-value-double-quote"})
        self.state = self.dataState
        return True
    self.currentToken["data"][-1][1] += (
        char + self.stream.charsUntil(("\"", u"&")))
    return True
|
||
|
|
||
|
def attributeValueSingleQuotedState(self):
    """Accumulate a single-quoted attribute value.

    The closing quote switches to afterAttributeValueState and "&" is
    handed to self.processEntityInAttribute.  U+0000 is a parse error and
    is stored as U+FFFD.  EOF is a parse error and returns to the data
    state.  Any other character is appended to the value together with
    the following run of characters up to the next quote or "&".

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterAttributeValueState
        return True
    if char == u"&":
        self.processEntityInAttribute(u"'")
        return True
    if char == u"\u0000":
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += u"\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "eof-in-attribute-value-single-quote"})
        self.state = self.dataState
        return True
    self.currentToken["data"][-1][1] += (
        char + self.stream.charsUntil(("'", u"&")))
    return True
|
||
|
|
||
|
def attributeValueUnQuotedState(self):
    """Accumulate an unquoted attribute value.

    A space character ends the value and switches to
    beforeAttributeNameState; ">" emits the current token; "&" is handed
    to self.processEntityInAttribute.  Quotes, "=", "<" and "`" are parse
    errors but are still appended.  U+0000 is a parse error and is stored
    as U+FFFD.  EOF is a parse error and returns to the data state.  Any
    other character is appended together with the following run of
    characters up to the next special or space character.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
    elif char == u"&":
        self.processEntityInAttribute(">")
    elif char == u">":
        self.emitCurrentToken()
    elif char in (u'"', u"'", u"=", u"<", u"`"):
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-character-in-unquoted-attribute-value"})
        self.currentToken["data"][-1][1] += char
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += u"\uFFFD"
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "eof-in-attribute-value-no-quotes"})
        self.state = self.dataState
    else:
        # Everything that needs individual handling above stops the run.
        stopChars = (frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`"))
                     | spaceCharacters)
        self.currentToken["data"][-1][1] += (
            char + self.stream.charsUntil(stopChars))
    return True
|
||
|
|
||
|
def afterAttributeValueState(self):
    """Handle the character that follows a quoted attribute value.

    A space character switches to beforeAttributeNameState, ">" emits the
    current token and "/" switches to selfClosingStartTagState.  EOF is a
    parse error; it is pushed back and the tokenizer returns to the data
    state.  Any other character is a parse error; it is pushed back and
    reread in beforeAttributeNameState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == u">":
        self.emitCurrentToken()
        return True
    if char == u"/":
        self.state = self.selfClosingStartTagState
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-EOF-after-attribute-value"})
        self.stream.unget(char)
        self.state = self.dataState
        return True
    self.tokenQueue.append(
        {"type": tokenTypes["ParseError"],
         "data": "unexpected-character-after-attribute-value"})
    self.stream.unget(char)
    self.state = self.beforeAttributeNameState
    return True
|
||
|
|
||
|
def selfClosingStartTagState(self):
    """Handle the character that follows "/" inside a tag.

    ">" marks the current token as self-closing and emits it.  EOF is a
    parse error; it is pushed back and the tokenizer returns to the data
    state.  Any other character is a parse error; it is pushed back and
    reread in beforeAttributeNameState.

    Always returns True.
    """
    char = self.stream.char()
    if char == ">":
        self.currentToken["selfClosing"] = True
        self.emitCurrentToken()
        return True
    if char is EOF:
        error = "unexpected-EOF-after-solidus-in-tag"
        nextState = self.dataState
    else:
        error = "unexpected-character-after-soldius-in-tag"
        nextState = self.beforeAttributeNameState
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": error})
    self.stream.unget(char)
    self.state = nextState
    return True
|
||
|
|
||
|
def bogusCommentState(self):
    """Emit everything up to the next ">" as a single Comment token.

    All characters until the first ">" or EOF are read in one go
    (charsUntil checks for EOF automatically), every U+0000 in them is
    replaced by U+FFFD, and the result is queued as a Comment token.  The
    tokenizer then returns to the data state.

    Always returns True.
    """
    text = self.stream.charsUntil(u">").replace(u"\u0000", u"\uFFFD")
    comment = {"type": tokenTypes["Comment"], "data": text}
    self.tokenQueue.append(comment)

    # Eat the character directly after the bogus comment, which is either
    # a ">" or an EOF.
    self.stream.char()
    self.state = self.dataState
    return True
|
||
|
|
||
|
def markupDeclarationOpenState(self):
    """Recognise what follows "<!": a comment, a DOCTYPE or a CDATA section.

    * "--" starts a Comment token and switches to commentStartState.
    * "doctype" (any letter case) starts a Doctype token and switches to
      doctypeState.
    * "[CDATA[" switches to cdataSectionState, but only when a parser is
      attached and its innermost open element is not in the tree's
      default namespace.

    On any mismatch a parse error is queued, every character read so far
    is pushed back onto the stream and the tokenizer switches to
    bogusCommentState.

    Always returns True.
    """
    seen = [self.stream.char()]
    if seen[-1] == u"-":
        seen.append(self.stream.char())
        if seen[-1] == u"-":
            self.currentToken = {"type": tokenTypes["Comment"], "data": u""}
            self.state = self.commentStartState
            return True
    elif seen[-1] in (u'd', u'D'):
        for pair in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'),
                     (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')):
            seen.append(self.stream.char())
            if seen[-1] not in pair:
                break
        else:
            # The loop finished without a mismatch: full keyword matched.
            self.currentToken = {"type": tokenTypes["Doctype"],
                                 "name": u"",
                                 "publicId": None, "systemId": None,
                                 "correct": True}
            self.state = self.doctypeState
            return True
    elif (seen[-1] == "[" and
          self.parser is not None and
          self.parser.tree.openElements and
          self.parser.tree.openElements[-1].namespace !=
          self.parser.tree.defaultNamespace):
        for wanted in ["C", "D", "A", "T", "A", "["]:
            seen.append(self.stream.char())
            if seen[-1] != wanted:
                break
        else:
            self.state = self.cdataSectionState
            return True

    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                            "data": "expected-dashes-or-doctype"})

    # Push the characters back in reverse order so they are reread in the
    # order they were originally consumed.
    while seen:
        self.stream.unget(seen.pop())
    self.state = self.bogusCommentState
    return True
|
||
|
|
||
|
def commentStartState(self):
    """Handle the first character after the "<!--" that opened a comment.

    * "-" switches to commentStartDashState.
    * U+0000 is a parse error; U+FFFD is appended to the comment data and
      the tokenizer switches to commentState, like the U+0000 branches of
      the other comment states.  Without that switch a following ">"
      would be treated as the terminator of an empty comment.
    * ">" is a parse error ("incorrect-comment"); the comment token is
      emitted and the tokenizer returns to the data state.
    * EOF is a parse error; the comment token is emitted and the
      tokenizer returns to the data state.
    * Anything else is appended to the comment data and switches to
      commentState.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.state = self.commentStartDashState
    elif data == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += u"\uFFFD"
        # The comment now has content, so continue as an ordinary comment.
        self.state = self.commentState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += data
        self.state = self.commentState
    return True
|
||
|
|
||
|
def commentStartDashState(self):
    """Handle the character after a "-" that directly followed "<!--".

    * "-" switches to commentEndState.
    * U+0000 is a parse error; "-" and U+FFFD are appended to the comment
      data and the tokenizer switches to commentState, like the U+0000
      branches of the other comment states.  Without that switch a
      following ">" would end the comment prematurely.
    * ">" is a parse error ("incorrect-comment"); the comment token is
      emitted and the tokenizer returns to the data state.
    * EOF is a parse error; the comment token is emitted and the
      tokenizer returns to the data state.
    * Anything else is appended to the comment data after the pending
      "-" and switches to commentState.

    Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.state = self.commentEndState
    elif data == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += u"-\uFFFD"
        # The comment now has content, so continue as an ordinary comment.
        self.state = self.commentState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "-" + data
        self.state = self.commentState
    return True
|
||
|
|
||
|
|
||
|
def commentState(self):
    """Accumulate the body of a comment.

    "-" switches to commentEndDashState.  U+0000 is a parse error and is
    stored as U+FFFD.  EOF is a parse error; the comment token is emitted
    and the tokenizer returns to the data state.  Any other character is
    appended to the comment data together with the following run of
    characters up to the next "-" or U+0000.

    Always returns True.
    """
    char = self.stream.char()
    if char == u"-":
        self.state = self.commentEndDashState
        return True
    if char == u"\u0000":
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"] += u"\uFFFD"
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.currentToken["data"] += (
        char + self.stream.charsUntil((u"-", u"\u0000")))
    return True
|
||
|
|
||
|
def commentEndDashState(self):
    """Handle the character after a single "-" inside a comment.

    A second "-" switches to commentEndState.  U+0000 is a parse error;
    the pending "-" and U+FFFD are appended and the tokenizer returns to
    commentState.  EOF is a parse error; the comment token is emitted and
    the tokenizer returns to the data state.  Any other character is
    appended after the pending "-" and the tokenizer returns to
    commentState.

    Always returns True.
    """
    char = self.stream.char()
    if char == u"-":
        self.state = self.commentEndState
        return True
    if char == u"\u0000":
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"] += u"-\uFFFD"
        self.state = self.commentState
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "eof-in-comment-end-dash"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.currentToken["data"] += u"-" + char
    self.state = self.commentState
    return True
|
||
|
|
||
|
def commentEndState(self):
    """Handle the character after "--" inside a comment.

    ">" emits the comment token and returns to the data state.  "!" is a
    parse error and switches to commentEndBangState.  A further "-" is a
    parse error and is appended to the comment data.  U+0000 is a parse
    error; "--" and U+FFFD are appended and the tokenizer returns to
    commentState.  EOF is a parse error; the comment token is emitted and
    the tokenizer returns to the data state.  Any other character is a
    parse error; it is appended after "--" and the tokenizer returns to
    commentState.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char == u">":
        emit(self.currentToken)
        self.state = self.dataState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"] += u"--\uFFFD"
        self.state = self.commentState
    elif char == "!":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-bang-after-double-dash-in-comment"})
        self.state = self.commentEndBangState
    elif char == u"-":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-dash-after-double-dash-in-comment"})
        self.currentToken["data"] += char
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "eof-in-comment-double-dash"})
        emit(self.currentToken)
        self.state = self.dataState
    else:
        # XXX
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-comment"})
        self.currentToken["data"] += u"--" + char
        self.state = self.commentState
    return True
|
||
|
|
||
|
def commentEndBangState(self):
    """Handle the character after "--!" inside a comment.

    ">" emits the comment token and returns to the data state.  "-"
    appends "--!" to the comment data and switches to
    commentEndDashState.  U+0000 is a parse error; "--!" and U+FFFD are
    appended and the tokenizer returns to commentState.  EOF is a parse
    error; the comment token is emitted and the tokenizer returns to the
    data state.  Any other character is appended after "--!" and the
    tokenizer returns to commentState.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char == u">":
        emit(self.currentToken)
        self.state = self.dataState
    elif char == u"-":
        self.currentToken["data"] += "--!"
        self.state = self.commentEndDashState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["data"] += u"--!\uFFFD"
        self.state = self.commentState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "eof-in-comment-end-bang-state"})
        emit(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += u"--!" + char
        self.state = self.commentState
    return True
|
||
|
|
||
|
def doctypeState(self):
    """Handle the character that follows the DOCTYPE keyword.

    A space character switches to beforeDoctypeNameState.  EOF is a parse
    error; the doctype token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is a parse
    error ("need-space-after-doctype"); it is pushed back and reread in
    beforeDoctypeNameState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "expected-doctype-name-but-got-eof"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.tokenQueue.append(
        {"type": tokenTypes["ParseError"],
         "data": "need-space-after-doctype"})
    self.stream.unget(char)
    self.state = self.beforeDoctypeNameState
    return True
|
||
|
|
||
|
def beforeDoctypeNameState(self):
    """Handle the character that precedes the doctype name.

    Space characters are ignored.  ">" and EOF are parse errors; the
    doctype token is marked incorrect, emitted, and the tokenizer returns
    to the data state.  U+0000 is a parse error and starts the name with
    U+FFFD.  Any other character starts the name.  Both switch to
    doctypeNameState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    emit = self.tokenQueue.append
    if char == u">":
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-doctype-name-but-got-right-bracket"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["name"] = u"\uFFFD"
        self.state = self.doctypeNameState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "expected-doctype-name-but-got-eof"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["name"] = char
        self.state = self.doctypeNameState
    return True
|
||
|
|
||
|
def doctypeNameState(self):
    """Accumulate the doctype name.

    The name is lower-cased (via asciiUpper2Lower) whenever it ends: a
    space character switches to afterDoctypeNameState; ">" emits the
    token and returns to the data state; EOF is a parse error that marks
    the token incorrect, emits it and returns to the data state.  U+0000
    is a parse error and is stored as U+FFFD.  Any other character is
    appended to the name.

    Always returns True.
    """
    char = self.stream.char()
    token = self.currentToken
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        token["name"] = token["name"].translate(asciiUpper2Lower)
        self.state = self.afterDoctypeNameState
    elif char == u">":
        token["name"] = token["name"].translate(asciiUpper2Lower)
        emit(token)
        self.state = self.dataState
    elif char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        token["name"] += u"\uFFFD"
        self.state = self.doctypeNameState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"],
              "data": "eof-in-doctype-name"})
        token["correct"] = False
        token["name"] = token["name"].translate(asciiUpper2Lower)
        emit(token)
        self.state = self.dataState
    else:
        token["name"] += char
    return True
|
||
|
|
||
|
def afterDoctypeNameState(self):
    """Handle what follows the doctype name.

    Space characters are ignored.  ">" emits the doctype token and
    returns to the data state.  EOF is pushed back and reported as a
    parse error; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  The keywords "public" and
    "system" (any letter case) switch to the matching after-keyword
    state.  Anything else is a parse error that marks the token
    incorrect and switches to bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.currentToken["correct"] = False
        self.stream.unget(char)
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if char in (u"p", u"P"):
        for pair in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"),
                     (u"i", u"I"), (u"c", u"C")):
            char = self.stream.char()
            if char not in pair:
                break
        else:
            # Every remaining letter of the keyword matched.
            self.state = self.afterDoctypePublicKeywordState
            return True
    elif char in (u"s", u"S"):
        for pair in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"),
                     (u"e", u"E"), (u"m", u"M")):
            char = self.stream.char()
            if char not in pair:
                break
        else:
            self.state = self.afterDoctypeSystemKeywordState
            return True

    # All the characters read before the current one will be [a-zA-Z], so
    # they are garbage in the bogus doctype and can be discarded; only the
    # latest character might be '>' or EOF and needs to be pushed back.
    self.stream.unget(char)
    self.tokenQueue.append(
        {"type": tokenTypes["ParseError"],
         "data": "expected-space-or-right-bracket-in-doctype",
         "datavars": {"data": char}})
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def afterDoctypePublicKeywordState(self):
    """Handle the character that follows the PUBLIC keyword.

    A space character switches to beforeDoctypePublicIdentifierState.  A
    quote is a parse error; it is pushed back and reread in that state.
    EOF is a parse error; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is pushed
    back and reread in beforeDoctypePublicIdentifierState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True
    if char in ("'", '"'):
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypePublicIdentifierState
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
|
||
|
|
||
|
def beforeDoctypePublicIdentifierState(self):
    """Handle the character that precedes the public identifier.

    Space characters are ignored.  A quote resets publicId to an empty
    string and switches to the matching quoted identifier state.  ">"
    and EOF are parse errors; the token is marked incorrect, emitted,
    and the tokenizer returns to the data state.  Any other character is
    a parse error that marks the token incorrect and switches to
    bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    emit = self.tokenQueue.append
    if char == "\"":
        self.currentToken["publicId"] = u""
        self.state = self.doctypePublicIdentifierDoubleQuotedState
    elif char == "'":
        self.currentToken["publicId"] = u""
        self.state = self.doctypePublicIdentifierSingleQuotedState
    elif char == ">":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def doctypePublicIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted doctype public identifier.

    The closing quote switches to afterDoctypePublicIdentifierState.
    U+0000 is a parse error and is stored as U+FFFD.  ">" and EOF are
    parse errors; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is
    appended to publicId.

    Always returns True.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    emit = self.tokenQueue.append
    if char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["publicId"] += u"\uFFFD"
    elif char == ">":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["publicId"] += char
    return True
|
||
|
|
||
|
def doctypePublicIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted doctype public identifier.

    The closing quote switches to afterDoctypePublicIdentifierState.
    U+0000 is a parse error and is stored as U+FFFD.  ">" and EOF are
    parse errors; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is
    appended to publicId.

    Always returns True.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterDoctypePublicIdentifierState
        return True
    emit = self.tokenQueue.append
    if char == u"\u0000":
        emit({"type": tokenTypes["ParseError"], "data": "invalid-codepoint"})
        self.currentToken["publicId"] += u"\uFFFD"
    elif char == ">":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["publicId"] += char
    return True
|
||
|
|
||
|
def afterDoctypePublicIdentifierState(self):
    """Handle the character that follows the public identifier.

    A space character switches to
    betweenDoctypePublicAndSystemIdentifiersState.  ">" emits the token
    and returns to the data state.  A quote is a parse error but still
    resets systemId to an empty string and switches to the matching
    quoted system identifier state.  EOF is a parse error; the token is
    marked incorrect, emitted, and the tokenizer returns to the data
    state.  Any other character is a parse error that marks the token
    incorrect and switches to bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()
    emit = self.tokenQueue.append
    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
    elif char == ">":
        emit(self.currentToken)
        self.state = self.dataState
    elif char == '"':
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif char == "'":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Handle whitespace-separated input between the two identifiers.

    Space characters are ignored.  ">" emits the token and returns to
    the data state.  A quote resets systemId to an empty string and
    switches to the matching quoted system identifier state.  EOF is a
    parse error; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is a parse
    error that marks the token incorrect and switches to
    bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    emit = self.tokenQueue.append
    if char == ">":
        emit(self.currentToken)
        self.state = self.dataState
    elif char == '"':
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif char == "'":
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif char == EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def afterDoctypeSystemKeywordState(self):
    """Handle the character that follows the SYSTEM keyword.

    A space character switches to beforeDoctypeSystemIdentifierState.  A
    quote is a parse error; it is pushed back and reread in that state.
    EOF is a parse error; the token is marked incorrect, emitted, and the
    tokenizer returns to the data state.  Any other character is pushed
    back and reread in beforeDoctypeSystemIdentifierState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
        return True
    if char in ("'", '"'):
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"],
             "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypeSystemIdentifierState
        return True
    if char is EOF:
        self.tokenQueue.append(
            {"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
|
||
|
|
||
|
def beforeDoctypeSystemIdentifierState(self):
    """Handle the character that precedes the system identifier.

    Space characters are ignored.  A quote resets systemId to an empty
    string and switches to the matching quoted identifier state.  ">"
    and EOF are parse errors; the token is marked incorrect, emitted,
    and the tokenizer returns to the data state.  Any other character is
    a parse error that marks the token incorrect and switches to
    bogusDoctypeState.

    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    emit = self.tokenQueue.append
    if char == "\"":
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif char == "'":
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif char == ">":
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        emit({"type": tokenTypes["ParseError"], "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        emit(self.currentToken)
        self.state = self.dataState
    else:
        emit({"type": tokenTypes["ParseError"],
              "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def doctypeSystemIdentifierDoubleQuotedState(self):
    """Accumulate a double-quoted DOCTYPE system identifier.

    Reads one character from the stream:

    * a double quote ends the identifier and switches to
      afterDoctypeSystemIdentifierState;
    * U+0000 queues an "invalid-codepoint" parse error and appends U+FFFD
      to currentToken["systemId"] instead;
    * ">" ("unexpected-end-of-doctype") and end of input
      ("eof-in-doctype") queue a parse error, flag the token as not
      correct, emit it and return to dataState;
    * any other character is appended to currentToken["systemId"].

    Always returns True.
    """
    char = self.stream.char()

    if char == "\"":
        self.state = self.afterDoctypeSystemIdentifierState
        return True

    if char == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += u"\uFFFD"
        return True

    premature_close = char == ">"
    if premature_close or char is EOF:
        # Both cases abandon the identifier; only the error code differs.
        if premature_close:
            errorCode = "unexpected-end-of-doctype"
        else:
            errorCode = "eof-in-doctype"
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                errorCode})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Ordinary character: part of the identifier.
    self.currentToken["systemId"] += char
    return True
|
||
|
|
||
|
def doctypeSystemIdentifierSingleQuotedState(self):
    """Accumulate a single-quoted DOCTYPE system identifier.

    Reads one character from the stream:

    * a single quote ends the identifier and switches to
      afterDoctypeSystemIdentifierState;
    * U+0000 queues an "invalid-codepoint" parse error and appends U+FFFD
      to currentToken["systemId"] instead;
    * ">" ("unexpected-end-of-doctype") and end of input
      ("eof-in-doctype") queue a parse error, flag the token as not
      correct, emit it and return to dataState;
    * any other character is appended to currentToken["systemId"].

    Always returns True.
    """
    char = self.stream.char()

    if char == "'":
        self.state = self.afterDoctypeSystemIdentifierState
        return True

    if char == u"\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += u"\uFFFD"
        return True

    premature_close = char == ">"
    if premature_close or char is EOF:
        # Both cases abandon the identifier; only the error code differs.
        if premature_close:
            errorCode = "unexpected-end-of-doctype"
        else:
            errorCode = "eof-in-doctype"
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                errorCode})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Ordinary character: part of the identifier.
    self.currentToken["systemId"] += char
    return True
|
||
|
|
||
|
def afterDoctypeSystemIdentifierState(self):
    """Handle input after the closing quote of the system identifier.

    Reads one character from the stream:

    * whitespace is skipped (the state does not change);
    * ">" emits the current token and returns to dataState;
    * end of input queues "eof-in-doctype", flags the token as not
      correct, emits it and returns to dataState;
    * anything else queues "unexpected-char-in-doctype" and switches to
      bogusDoctypeState (the token's "correct" flag is left untouched).

    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        # Trailing whitespace before ">" is allowed; keep reading.
        return True

    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Unexpected junk after the identifier.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "unexpected-char-in-doctype"})
    self.state = self.bogusDoctypeState
    return True
|
||
|
|
||
|
def bogusDoctypeState(self):
    """Discard characters of a malformed DOCTYPE until it ends.

    Reads one character from the stream. On ">" or end of input the
    current token is emitted and the tokenizer returns to dataState; for
    end of input the EOF marker is first pushed back onto the stream. Any
    other character is ignored and the state does not change.

    Always returns True.
    """
    char = self.stream.char()
    reached_eof = char is EOF

    if char == u">" or reached_eof:
        if reached_eof:
            # XXX EMIT
            # Push EOF back so the next state reads it again.
            self.stream.unget(char)
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState

    # Everything else is silently dropped.
    return True
|
||
|
|
||
|
def cdataSectionState(self):
    """Consume a CDATA section and emit its text as one Characters token.

    Input is read up to the first "]]>" terminator or to end of input,
    whichever comes first. The terminator itself is dropped. Each U+0000
    in the collected text queues one "invalid-codepoint" parse error and
    is replaced by U+FFFD. A Characters token is queued only when the
    resulting text is non-empty. The tokenizer then returns to dataState.

    The terminator is detected by scanning up to each ">" and checking
    whether the text just before it ends with "]]". Matching exactly three
    characters after the first "]" misses the terminator when it is
    preceded by extra "]" characters (for example "x]]]>"), and the section
    then swallows the markup that follows.

    Always returns True.
    """
    chunks = []
    while True:
        # Text before the next "]" cannot contain any part of a terminator.
        chunks.append(self.stream.charsUntil(u"]"))
        # Text from that "]" up to, but not including, the next ">".
        # NOTE(review): relies on charsUntil stopping only at a listed
        # character or at end of input - confirm against HTMLInputStream.
        chunks.append(self.stream.charsUntil(u">"))
        char = self.stream.char()
        if char is EOF:
            # Unterminated section: keep everything collected so far.
            break
        if chunks[-1][-2:] == u"]]":
            # Found "]]>": strip the two brackets and stop.
            chunks[-1] = chunks[-1][:-2]
            break
        # A lone ">" is ordinary data; keep it and continue scanning.
        chunks.append(char)

    data = "".join(chunks)

    # Deal with null here rather than in the parser
    nullCount = data.count(u"\u0000")
    if nullCount > 0:
        for _ in range(nullCount):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
        data = data.replace(u"\u0000", u"\uFFFD")

    if data:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": data})
    self.state = self.dataState
    return True
|