try:
    from types import ModuleType
except ImportError:
    # Fallback for interpreters where ``types`` does not export ModuleType.
    from new import module as ModuleType
import re
import types

import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces

# Splits an ElementTree "{namespace}localname" string into its two parts.
tag_regexp = re.compile("{([^}]*)}(.*)")

# Cache of modules already produced by getETreeModule.
moduleCache = {}


def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a module object exposing a builder for the given implementation.

    The module's namespace is filled with everything returned by
    ``getETreeBuilder(ElementTreeImplementation, fullTree)``.  Modules are
    cached in ``moduleCache`` so repeated calls return the same object.

    ElementTreeImplementation -- an ElementTree-compatible module; its
        ``__name__`` is used to derive the generated module's name.
    fullTree -- forwarded to getETreeBuilder (see TreeBuilder.getDocument).
    """
    name = "_" + ElementTreeImplementation.__name__ + "builder"
    # The builder closes over fullTree, so the flag must be part of the cache
    # key; the key for the default (fullTree false) case is just the name.
    cacheKey = name
    if fullTree:
        cacheKey = name + "_fulltree"
    if cacheKey in moduleCache:
        return moduleCache[cacheKey]
    mod = ModuleType(name)
    mod.__dict__.update(getETreeBuilder(ElementTreeImplementation, fullTree))
    moduleCache[cacheKey] = mod
    return mod


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    """Build the tree-builder classes bound to one ElementTree implementation.

    Returns ``locals()``: a dict holding the arguments, the ``ElementTree``
    alias, the node wrapper classes, the two serializer functions and the
    ``TreeBuilder`` class.
    """
    ElementTree = ElementTreeImplementation

    class Element(_base.Node):
        """Wrapper pairing a tree-builder node with an ElementTree element."""

        def __init__(self, name, namespace=None):
            self._name = name
            self._namespace = namespace
            self._element = ElementTree.Element(
                self._getETreeTag(name, namespace))
            if namespace is None:
                # Nodes without an explicit namespace are reported as HTML.
                self.nameTuple = namespaces["html"], self._name
            else:
                self.nameTuple = self._namespace, self._name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getETreeTag(self, name, namespace):
            """Return ``name``, or ``{namespace}name`` when namespaced."""
            if namespace is None:
                return name
            return "{%s}%s" % (namespace, name)

        def _setName(self, name):
            self._name = name
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _setNamespace(self, namespace):
            self._namespace = namespace
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getNamespace(self):
            return self._namespace

        namespace = property(_getNamespace, _setNamespace)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            """Replace all attributes of the wrapped element."""
            # Drop the existing attributes in one step instead of deleting
            # keys while walking over them.
            self._element.attrib.clear()
            for key, value in attributes.iteritems():
                if isinstance(key, tuple):
                    # Index 2 is used as the namespace and index 1 as the
                    # local name.
                    # NOTE(review): presumably (prefix, localName, namespace)
                    # -- confirm against the caller.
                    name = "{%s}%s" % (key[2], key[1])
                else:
                    name = key
                self._element.set(name, value)

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes

        def _setChildNodes(self, value):
            """Discard all current children and re-add those in ``value``."""
            del self._element[:]
            self._childNodes = []
            for element in value:
                # NOTE(review): insertChild is not defined in this file;
                # presumably inherited from _base.Node -- verify (appendChild
                # may be what was intended).
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or len(self._element))

        def appendChild(self, node):
            """Append ``node`` as the last child of this node."""
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            """Insert ``node`` immediately before the child ``refNode``.

            Raises ValueError if ``refNode``'s element is not a child of this
            node's element.
            """
            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            # Keep the wrapper list in step with the element's children so
            # that childNodes does not silently miss this node.
            if refNode in self._childNodes:
                self._childNodes.insert(self._childNodes.index(refNode), node)
            else:
                self._childNodes.append(node)
            node.parent = self

        def removeChild(self, node):
            """Detach the child ``node`` from this node."""
            self._element.remove(node._element)
            # Keep the wrapper list in step with the element's children.
            if node in self._childNodes:
                self._childNodes.remove(node)
            node.parent = None

        def insertText(self, data, insertBefore=None):
            """Add the text ``data`` to this node.

            ElementTree stores text either as the parent's ``text`` (before
            the first child) or as the ``tail`` of the preceding child.
            Without ``insertBefore`` the text goes at the end; otherwise it is
            placed just before the child ``insertBefore``.
            """
            if not len(self._element):
                # No children yet: everything lives in .text
                self._element.text = (self._element.text or "") + data
            elif insertBefore is None:
                # Insert the text as the tail of the last child element
                last = self._element[-1]
                last.tail = (last.tail or "") + data
            else:
                # Insert the text before the specified node
                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    previous = self._element[index - 1]
                    previous.tail = (previous.tail or "") + data
                else:
                    self._element.text = (self._element.text or "") + data

        def cloneNode(self):
            """Return a childless copy with the same name and attributes."""
            element = type(self)(self.name, self.namespace)
            for name, value in self.attributes.iteritems():
                element.attributes[name] = value
            return element

        def reparentChildren(self, newParent):
            """Move this node's text and children to the end of ``newParent``."""
            ownText = self._element.text
            if newParent.childNodes:
                # Our leading text becomes tail text of newParent's last
                # child.  Either side may be None, so treat None as empty.
                if ownText:
                    last = newParent.childNodes[-1]._element
                    last.tail = (last.tail or "") + ownText
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if ownText is not None:
                    newParent._element.text += ownText
            self._element.text = ""
            _base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        """Wrapper around an ElementTree comment node."""

        def __init__(self, data):
            # Element.__init__ is deliberately not called: the wrapped node
            # comes from ElementTree.Comment, so only the bookkeeping
            # attributes are set up here.
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        """Doctype node: a sentinel-tagged element whose text is the name."""

        def __init__(self, name, publicId, systemId):
            # NOTE(review): the sentinel tag literal was reconstructed (it is
            # empty in the reviewed text); it must match the tag tested in
            # testSerializer/tostring and in any other module that inspects
            # these trees -- confirm.
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name
            self.publicId = publicId
            self.systemId = systemId

        def _getPublicId(self):
            return self._element.get(u"publicId", "")

        def _setPublicId(self, value):
            if value is not None:
                self._element.set(u"publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get(u"systemId", "")

        def _setSystemId(self, value):
            if value is not None:
                self._element.set(u"systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        """Root node of a complete document."""

        def __init__(self):
            # NOTE(review): sentinel tag reconstructed -- confirm (see
            # DocumentType.__init__).
            Element.__init__(self, "<DOCUMENT_ROOT>")

    class DocumentFragment(Element):
        """Root node of a fragment."""

        def __init__(self):
            # NOTE(review): sentinel tag reconstructed -- confirm (see
            # DocumentType.__init__).
            Element.__init__(self, "<DOCUMENT_FRAGMENT>")

    def testSerializer(element):
        """Return an indented, one-node-per-line dump of ``element``.

        ``element`` may be an element or any object without a ``tag``
        attribute that offers ``getroot()``.  Lines are joined with newlines.
        """
        rv = []

        def serializeElement(element, indent=0):
            if not hasattr(element, "tag"):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                # NOTE(review): doctype format strings reconstructed -- the
                # originals were empty and raised TypeError; confirm wording.
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s "%s" "%s">""" % (
                        element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "<DOCUMENT_ROOT>":
                rv.append("#document")
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
                # The root's tail is emitted by the common code below.
            elif element.tag == ElementTree.Comment:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert type(element.tag) in types.StringTypes, "Expected unicode, got %s"%type(element.tag)
                nsmatch = tag_regexp.match(element.tag)
                if nsmatch is None:
                    name = element.tag
                else:
                    ns, name = nsmatch.groups()
                    prefix = constants.prefixes[ns]
                    name = "%s %s" % (prefix, name)
                rv.append("|%s<%s>" % (' ' * indent, name))

                if hasattr(element, "attrib"):
                    attributes = []
                    for attrName, value in element.attrib.iteritems():
                        attrMatch = tag_regexp.match(attrName)
                        if attrMatch is not None:
                            ns, localName = attrMatch.groups()
                            prefix = constants.prefixes[ns]
                            attr_string = "%s %s" % (prefix, localName)
                        else:
                            attr_string = attrName
                        attributes.append((attr_string, value))
                    # Sorted so the dump does not depend on dict ordering.
                    for attrName, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (
                            ' ' * (indent + 2), attrName, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            # Common to every node kind: children one level deeper, then the
            # node's own tail at the node's level.
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))

        serializeElement(element, 0)
        return "\n".join(rv)

    def tostring(element):
        """Serialize an element and its child nodes to a string"""
        rv = []
        infosetFilter = ihatexml.InfosetFilter()

        def serializeElement(element):
            # Same unwrapping rule as testSerializer: anything without a
            # ``tag`` is treated as a tree and replaced by its root.
            if not hasattr(element, "tag"):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                # NOTE(review): doctype format strings reconstructed -- the
                # originals were empty and raised TypeError; confirm wording.
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" % (
                        element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "<DOCUMENT_ROOT>":
                if element.text:
                    rv.append(element.text)
                for child in element:
                    serializeElement(child)
            elif element.tag == ElementTree.Comment:
                rv.append("<!--%s-->" % (element.text,))
            else:
                # This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>" % (
                        infosetFilter.fromXmlName(element.tag),))
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        infosetFilter.fromXmlName(name), value)
                        for name, value in element.attrib.iteritems()])
                    # NOTE(review): the tag is not passed through fromXmlName
                    # here or in the closing tag, unlike the attribute-less
                    # case above -- confirm whether that is intended.
                    rv.append("<%s %s>" % (element.tag, attr))
                if element.text:
                    rv.append(element.text)
                for child in element:
                    serializeElement(child)
                rv.append("</%s>" % (element.tag,))

            # Text following the node, whatever kind of node it is.
            if element.tail:
                rv.append(element.tail)

        serializeElement(element)
        return "".join(rv)

    class TreeBuilder(_base.TreeBuilder):
        """Tree builder producing trees for the bound ElementTree module."""

        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            """Return the built tree.

            With ``fullTree`` the document root element is returned;
            otherwise the result of looking up its ``html`` child, qualified
            with ``defaultNamespace`` when that is set.
            """
            if fullTree:
                return self.document._element
            if self.defaultNamespace is not None:
                return self.document._element.find(
                    "{%s}html" % self.defaultNamespace)
            return self.document._element.find("html")

        def getFragment(self):
            return _base.TreeBuilder.getFragment(self)._element

    return locals()