diff --git a/lib/lxml/ElementInclude.py b/lib/lxml/ElementInclude.py deleted file mode 100644 index f7806709..00000000 --- a/lib/lxml/ElementInclude.py +++ /dev/null @@ -1,223 +0,0 @@ -# -# ElementTree -# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $ -# -# limited xinclude support for element trees -# -# history: -# 2003-08-15 fl created -# 2003-11-14 fl fixed default loader -# -# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved. -# -# fredrik@pythonware.com -# http://www.pythonware.com -# -# -------------------------------------------------------------------- -# The ElementTree toolkit is -# -# Copyright (c) 1999-2004 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its -# associated documentation, you agree that you have read, understood, -# and will comply with the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and -# its associated documentation for any purpose and without fee is -# hereby granted, provided that the above copyright notice appears in -# all copies, and that both that copyright notice and this permission -# notice appear in supporting documentation, and that the name of -# Secret Labs AB or the author not be used in advertising or publicity -# pertaining to distribution of the software without specific, written -# prior permission. -# -# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD -# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- -# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR -# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -# OF THIS SOFTWARE. 
-# -------------------------------------------------------------------- - -""" -Limited XInclude support for the ElementTree package. - -While lxml.etree has full support for XInclude (see -`etree.ElementTree.xinclude()`), this module provides a simpler, pure -Python, ElementTree compatible implementation that supports a simple -form of custom URL resolvers. -""" - -from lxml import etree -import copy -try: - from urlparse import urljoin - from urllib2 import urlopen -except ImportError: - # Python 3 - from urllib.parse import urljoin - from urllib.request import urlopen - -try: - set -except NameError: - # Python 2.3 - from sets import Set as set - -XINCLUDE = "{http://www.w3.org/2001/XInclude}" - -XINCLUDE_INCLUDE = XINCLUDE + "include" -XINCLUDE_FALLBACK = XINCLUDE + "fallback" - -## -# Fatal include error. - -class FatalIncludeError(etree.LxmlSyntaxError): - pass - -## -# ET compatible default loader. -# This loader reads an included resource from disk. -# -# @param href Resource reference. -# @param parse Parse mode. Either "xml" or "text". -# @param encoding Optional text encoding. -# @return The expanded resource. If the parse mode is "xml", this -# is an ElementTree instance. If the parse mode is "text", this -# is a Unicode string. If the loader fails, it can return None -# or raise an IOError exception. -# @throws IOError If the loader fails to load the resource. 
- -def default_loader(href, parse, encoding=None): - file = open(href, 'rb') - if parse == "xml": - data = etree.parse(file).getroot() - else: - data = file.read() - if not encoding: - encoding = 'utf-8' - data = data.decode(encoding) - file.close() - return data - -## -# Default loader used by lxml.etree - handles custom resolvers properly -# - -def _lxml_default_loader(href, parse, encoding=None, parser=None): - if parse == "xml": - data = etree.parse(href, parser).getroot() - else: - if "://" in href: - f = urlopen(href) - else: - f = open(href, 'rb') - data = f.read() - f.close() - if not encoding: - encoding = 'utf-8' - data = data.decode(encoding) - return data - -## -# Wrapper for ET compatibility - drops the parser - -def _wrap_et_loader(loader): - def load(href, parse, encoding=None, parser=None): - return loader(href, parse, encoding) - return load - - -## -# Expand XInclude directives. -# -# @param elem Root element. -# @param loader Optional resource loader. If omitted, it defaults -# to {@link default_loader}. If given, it should be a callable -# that implements the same interface as default_loader. -# @throws FatalIncludeError If the function fails to include a given -# resource, or if the tree contains malformed XInclude elements. -# @throws IOError If the function fails to load a given resource. 
# @returns the node or its replacement if it was an XInclude node

def include(elem, loader=None, base_url=None):
    # Accepts either an ElementTree or an Element.  Without an explicit
    # base_url, the URL recorded in the tree's docinfo (if any) is used to
    # resolve relative hrefs.
    if base_url is None:
        if hasattr(elem, 'getroot'):
            tree = elem
            elem = elem.getroot()
        else:
            tree = elem.getroottree()
        if hasattr(tree, 'docinfo'):
            base_url = tree.docinfo.URL
    elif hasattr(elem, 'getroot'):
        elem = elem.getroot()
    _include(elem, loader, base_url=base_url)

##
# Recursive worker behind include().
#
# @param elem Element whose subtree is scanned for XInclude elements.
# @param loader Optional ET style loader (href, parse, encoding).
# @param _parent_hrefs Set of hrefs of the documents currently being
#     included on the path from the top-level document down to elem.
# @param base_url URL that relative hrefs are resolved against.
# @return elem, or the loaded replacement (element or text) if elem
#     itself was an xi:include element without a parent.
# @throws FatalIncludeError On recursive includes, failed loads,
#     unknown parse modes or misplaced XInclude elements.

def _include(elem, loader=None, _parent_hrefs=None, base_url=None):
    if loader is not None:
        load_include = _wrap_et_loader(loader)
    else:
        load_include = _lxml_default_loader

    if _parent_hrefs is None:
        _parent_hrefs = set()

    parser = elem.getroottree().parser

    # Collect the XInclude elements up front because the tree is modified
    # while they are being processed.
    include_elements = list(
        elem.iter('{http://www.w3.org/2001/XInclude}*'))

    for e in include_elements:
        if e.tag == XINCLUDE_INCLUDE:
            # process xinclude directive
            href = urljoin(base_url, e.get("href"))
            parse = e.get("parse", "xml")
            parent = e.getparent()
            if parse == "xml":
                if href in _parent_hrefs:
                    raise FatalIncludeError(
                        "recursive include of %r detected" % href
                        )
                node = load_include(href, parse, parser=parser)
                if node is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                # Recurse with a *copy* of the ancestor set so that the same
                # document may be included by several siblings without being
                # reported as a recursive include, and resolve nested
                # relative hrefs against the included document itself.
                node = _include(node, loader, _parent_hrefs.union([href]),
                                base_url=href)
                if e.tail:
                    node.tail = (node.tail or "") + e.tail
                if parent is None:
                    return node  # replaced the root node!
                parent.replace(e, node)
            elif parse == "text":
                text = load_include(href, parse, encoding=e.get("encoding"))
                if text is None:
                    raise FatalIncludeError(
                        "cannot load %r as %r" % (href, parse)
                        )
                if parent is None:
                    return text  # replaced the root node!
                # Removing the element also drops its tail, so the tail has
                # to be carried over explicitly in both branches.
                text += e.tail or ""
                predecessor = e.getprevious()
                if predecessor is not None:
                    predecessor.tail = (predecessor.tail or "") + text
                else:
                    parent.text = (parent.text or "") + text
                parent.remove(e)
            else:
                raise FatalIncludeError(
                    "unknown parse type in xi:include tag (%r)" % parse
                    )
        elif e.tag == XINCLUDE_FALLBACK:
            parent = e.getparent()
            if parent is not None and parent.tag != XINCLUDE_INCLUDE:
                raise FatalIncludeError(
                    "xi:fallback tag must be child of xi:include (%r)" % e.tag
                    )
        else:
            raise FatalIncludeError(
                "Invalid element found in XInclude namespace (%r)" % e.tag
                )
    return elem

# diff --git a/lib/lxml/__init__.py b/lib/lxml/__init__.py
# deleted file mode 100644
# index 07cbe3a2..00000000
# --- a/lib/lxml/__init__.py
# +++ /dev/null
# @@ -1,20 +0,0 @@
# this is a package

def get_include():
    """
    Returns a list of header include paths (for lxml itself, libxml2
    and libxslt) needed to compile C code against lxml if it was built
    with statically linked libraries.
    """
    import os
    lxml_path = __path__[0]
    include_path = os.path.join(lxml_path, 'includes')
    includes = [include_path, lxml_path]

    # every sub-directory of "includes" is an include path of its own
    for name in os.listdir(include_path):
        path = os.path.join(include_path, name)
        if os.path.isdir(path):
            includes.append(path)

    return includes

# diff --git a/lib/lxml/_elementpath.py b/lib/lxml/_elementpath.py
# deleted file mode 100644
# index bc9176e8..00000000
# --- a/lib/lxml/_elementpath.py
# +++ /dev/null
# @@ -1,306 +0,0 @@
#
# ElementTree
# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
#
# limited xpath support for element trees
#
# history:
# 2003-05-23 fl   created
# 2003-05-28 fl   added support for // etc
# 2003-08-27 fl   fixed parsing of periods in element names
# 2007-09-10 fl   new selection engine
# 2007-09-12 fl   fixed parent selector
# 2007-09-13 fl   added iterfind; changed findall to return a list
# 2007-11-30 fl   added namespaces support
# 2009-10-30 fl   added child element value filter
#
# Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
-# -# fredrik@pythonware.com -# http://www.pythonware.com -# -# -------------------------------------------------------------------- -# The ElementTree toolkit is -# -# Copyright (c) 1999-2009 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its -# associated documentation, you agree that you have read, understood, -# and will comply with the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and -# its associated documentation for any purpose and without fee is -# hereby granted, provided that the above copyright notice appears in -# all copies, and that both that copyright notice and this permission -# notice appear in supporting documentation, and that the name of -# Secret Labs AB or the author not be used in advertising or publicity -# pertaining to distribution of the software without specific, written -# prior permission. -# -# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD -# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- -# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR -# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -# OF THIS SOFTWARE. -# -------------------------------------------------------------------- - -## -# Implementation module for XPath support. There's usually no reason -# to import this module directly; the ElementTree does this for -# you, if needed. 
-## - -import re - -xpath_tokenizer_re = re.compile( - "(" - "'[^']*'|\"[^\"]*\"|" - "::|" - "//?|" - "\.\.|" - "\(\)|" - "[/.*:\[\]\(\)@=])|" - "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|" - "\s+" - ) - -def xpath_tokenizer(pattern, namespaces=None): - for token in xpath_tokenizer_re.findall(pattern): - tag = token[1] - if tag and tag[0] != "{" and ":" in tag: - try: - prefix, uri = tag.split(":", 1) - if not namespaces: - raise KeyError - yield token[0], "{%s}%s" % (namespaces[prefix], uri) - except KeyError: - raise SyntaxError("prefix %r not found in prefix map" % prefix) - else: - yield token - - -def prepare_child(next, token): - tag = token[1] - def select(result): - for elem in result: - for e in elem.iterchildren(tag): - yield e - return select - -def prepare_star(next, token): - def select(result): - for elem in result: - for e in elem.iterchildren('*'): - yield e - return select - -def prepare_self(next, token): - def select(result): - return result - return select - -def prepare_descendant(next, token): - token = next() - if token[0] == "*": - tag = "*" - elif not token[0]: - tag = token[1] - else: - raise SyntaxError("invalid descendant") - def select(result): - for elem in result: - for e in elem.iterdescendants(tag): - yield e - return select - -def prepare_parent(next, token): - def select(result): - for elem in result: - parent = elem.getparent() - if parent is not None: - yield parent - return select - -def prepare_predicate(next, token): - # FIXME: replace with real parser!!! 
refs: - # http://effbot.org/zone/simple-iterator-parser.htm - # http://javascript.crockford.com/tdop/tdop.html - signature = [] - predicate = [] - while 1: - token = next() - if token[0] == "]": - break - if token[0] and token[0][:1] in "'\"": - token = "'", token[0][1:-1] - signature.append(token[0] or "-") - predicate.append(token[1]) - signature = "".join(signature) - # use signature to determine predicate type - if signature == "@-": - # [@attribute] predicate - key = predicate[1] - def select(result): - for elem in result: - if elem.get(key) is not None: - yield elem - return select - if signature == "@-='": - # [@attribute='value'] - key = predicate[1] - value = predicate[-1] - def select(result): - for elem in result: - if elem.get(key) == value: - yield elem - return select - if signature == "-" and not re.match("-?\d+$", predicate[0]): - # [tag] - tag = predicate[0] - def select(result): - for elem in result: - for _ in elem.iterchildren(tag): - yield elem - break - return select - if signature == "-='" and not re.match("-?\d+$", predicate[0]): - # [tag='value'] - tag = predicate[0] - value = predicate[-1] - def select(result): - for elem in result: - for e in elem.iterchildren(tag): - if "".join(e.itertext()) == value: - yield elem - break - return select - if signature == "-" or signature == "-()" or signature == "-()-": - # [index] or [last()] or [last()-index] - if signature == "-": - # [index] - index = int(predicate[0]) - 1 - if index < 0: - if index == -1: - raise SyntaxError( - "indices in path predicates are 1-based, not 0-based") - else: - raise SyntaxError("path index >= 1 expected") - else: - if predicate[0] != "last": - raise SyntaxError("unsupported function") - if signature == "-()-": - try: - index = int(predicate[2]) - 1 - except ValueError: - raise SyntaxError("unsupported expression") - else: - index = -1 - def select(result): - for elem in result: - parent = elem.getparent() - if parent is None: - continue - try: - # FIXME: what if the 
selector is "*" ? - elems = list(parent.iterchildren(elem.tag)) - if elems[index] is elem: - yield elem - except IndexError: - pass - return select - raise SyntaxError("invalid predicate") - -ops = { - "": prepare_child, - "*": prepare_star, - ".": prepare_self, - "..": prepare_parent, - "//": prepare_descendant, - "[": prepare_predicate, - } - -_cache = {} - -# -------------------------------------------------------------------- - -def _build_path_iterator(path, namespaces): - # compile selector pattern - if path[-1:] == "/": - path = path + "*" # implicit all (FIXME: keep this?) - try: - return _cache[(path, namespaces and tuple(sorted(namespaces.items())) or None)] - except KeyError: - pass - if len(_cache) > 100: - _cache.clear() - - if path[:1] == "/": - raise SyntaxError("cannot use absolute path on element") - stream = iter(xpath_tokenizer(path, namespaces)) - try: - _next = stream.next - except AttributeError: - # Python 3 - _next = stream.__next__ - try: - token = _next() - except StopIteration: - raise SyntaxError("empty path expression") - selector = [] - while 1: - try: - selector.append(ops[token[0]](_next, token)) - except StopIteration: - raise SyntaxError("invalid path") - try: - token = _next() - if token[0] == "/": - token = _next() - except StopIteration: - break - _cache[path] = selector - return selector - -## -# Iterate over the matching nodes - -def iterfind(elem, path, namespaces=None): - selector = _build_path_iterator(path, namespaces) - result = iter((elem,)) - for select in selector: - result = select(result) - return result - -## -# Find first matching object. - -def find(elem, path, namespaces=None): - it = iterfind(elem, path, namespaces) - try: - try: - _next = it.next - except AttributeError: - return next(it) - else: - return _next() - except StopIteration: - return None - -## -# Find all matching objects. 
- -def findall(elem, path, namespaces=None): - return list(iterfind(elem, path, namespaces)) - -## -# Find text for first matching object. - -def findtext(elem, path, default=None, namespaces=None): - el = find(elem, path, namespaces) - if el is None: - return default - else: - return el.text or '' diff --git a/lib/lxml/apihelpers.pxi b/lib/lxml/apihelpers.pxi deleted file mode 100644 index c41e3044..00000000 --- a/lib/lxml/apihelpers.pxi +++ /dev/null @@ -1,1645 +0,0 @@ -# Private/public helper functions for API functions - -from lxml.includes cimport uri - -cdef object OrderedDict = None -try: - from collections import OrderedDict -except ImportError: - pass - -cdef void displayNode(xmlNode* c_node, indent): - # to help with debugging - cdef xmlNode* c_child - try: - print indent * u' ', c_node - c_child = c_node.children - while c_child is not NULL: - displayNode(c_child, indent + 1) - c_child = c_child.next - finally: - return # swallow any exceptions - -cdef inline int _assertValidNode(_Element element) except -1: - assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element) - -cdef inline int _assertValidDoc(_Document doc) except -1: - assert doc._c_doc is not NULL, u"invalid Document proxy at %s" % id(doc) - -cdef _Document _documentOrRaise(object input): - u"""Call this to get the document of a _Document, _ElementTree or _Element - object, or to raise an exception if it can't be determined. - - Should be used in all API functions for consistency. 
- """ - cdef _Document doc - if isinstance(input, _ElementTree): - if (<_ElementTree>input)._context_node is not None: - doc = (<_ElementTree>input)._context_node._doc - else: - doc = None - elif isinstance(input, _Element): - doc = (<_Element>input)._doc - elif isinstance(input, _Document): - doc = <_Document>input - else: - raise TypeError, u"Invalid input object: %s" % \ - python._fqtypename(input).decode('utf8') - if doc is None: - raise ValueError, u"Input object has no document: %s" % \ - python._fqtypename(input).decode('utf8') - _assertValidDoc(doc) - return doc - -cdef _Element _rootNodeOrRaise(object input): - u"""Call this to get the root node of a _Document, _ElementTree or - _Element object, or to raise an exception if it can't be determined. - - Should be used in all API functions for consistency. - """ - cdef _Element node - if isinstance(input, _ElementTree): - node = (<_ElementTree>input)._context_node - elif isinstance(input, _Element): - node = <_Element>input - elif isinstance(input, _Document): - node = (<_Document>input).getroot() - else: - raise TypeError, u"Invalid input object: %s" % \ - python._fqtypename(input).decode('utf8') - if (node is None or not node._c_node or - node._c_node.type != tree.XML_ELEMENT_NODE): - raise ValueError, u"Input object has no element: %s" % \ - python._fqtypename(input).decode('utf8') - _assertValidNode(node) - return node - -cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, - _BaseParser parser, text, tail, attrib, nsmap, - dict extra_attrs): - u"""Create a new element and initialize text content, namespaces and - attributes. - - This helper function will reuse as much of the existing document as - possible: - - If 'parser' is None, the parser will be inherited from 'doc' or the - default parser will be used. - - If 'doc' is None, 'c_doc' is used to create a new _Document and the new - element is made its root node. - - If 'c_doc' is also NULL, a new xmlDoc will be created. 
- """ - cdef xmlNode* c_node - if doc is not None: - c_doc = doc._c_doc - ns_utf, name_utf = _getNsTag(tag) - if parser is not None and parser._for_html: - _htmlTagValidOrRaise(name_utf) - if c_doc is NULL: - c_doc = _newHTMLDoc() - else: - _tagValidOrRaise(name_utf) - if c_doc is NULL: - c_doc = _newXMLDoc() - c_node = _createElement(c_doc, name_utf) - if c_node is NULL: - if doc is None and c_doc is not NULL: - tree.xmlFreeDoc(c_doc) - raise MemoryError() - try: - if doc is None: - tree.xmlDocSetRootElement(c_doc, c_node) - doc = _documentFactory(c_doc, parser) - if text is not None: - _setNodeText(c_node, text) - if tail is not None: - _setTailText(c_node, tail) - # add namespaces to node if necessary - _initNodeNamespaces(c_node, doc, ns_utf, nsmap) - _initNodeAttributes(c_node, doc, attrib, extra_attrs) - return _elementFactory(doc, c_node) - except: - # free allocated c_node/c_doc unless Python does it for us - if c_node.doc is not c_doc: - # node not yet in document => will not be freed by document - if tail is not None: - _removeText(c_node.next) # tail - tree.xmlFreeNode(c_node) - if doc is None: - # c_doc will not be freed by doc - tree.xmlFreeDoc(c_doc) - raise - -cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf, - _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1: - u"""Initialise a new Element object. - - This is used when users instantiate a Python Element subclass - directly, without it being mapped to an existing XML node. 
- """ - cdef xmlDoc* c_doc - cdef xmlNode* c_node - cdef _Document doc - if is_html: - _htmlTagValidOrRaise(name_utf) - c_doc = _newHTMLDoc() - else: - _tagValidOrRaise(name_utf) - c_doc = _newXMLDoc() - c_node = _createElement(c_doc, name_utf) - if c_node is NULL: - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) - raise MemoryError() - tree.xmlDocSetRootElement(c_doc, c_node) - doc = _documentFactory(c_doc, parser) - # add namespaces to node if necessary - _initNodeNamespaces(c_node, doc, ns_utf, nsmap) - _initNodeAttributes(c_node, doc, attrib, extra_attrs) - _registerProxy(element, doc, c_node) - element._init() - return 0 - -cdef _Element _makeSubElement(_Element parent, tag, text, tail, - attrib, nsmap, dict extra_attrs): - u"""Create a new child element and initialize text content, namespaces and - attributes. - """ - cdef xmlNode* c_node - cdef xmlDoc* c_doc - if parent is None or parent._doc is None: - return None - _assertValidNode(parent) - ns_utf, name_utf = _getNsTag(tag) - c_doc = parent._doc._c_doc - - if parent._doc._parser is not None and parent._doc._parser._for_html: - _htmlTagValidOrRaise(name_utf) - else: - _tagValidOrRaise(name_utf) - - c_node = _createElement(c_doc, name_utf) - if c_node is NULL: - raise MemoryError() - tree.xmlAddChild(parent._c_node, c_node) - - try: - if text is not None: - _setNodeText(c_node, text) - if tail is not None: - _setTailText(c_node, tail) - - # add namespaces to node if necessary - _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) - _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) - return _elementFactory(parent._doc, c_node) - except: - # make sure we clean up in case of an error - _removeNode(parent._doc, c_node) - raise - -cdef int _initNodeNamespaces(xmlNode* c_node, _Document doc, - object node_ns_utf, object nsmap) except -1: - u"""Lookup current namespace prefixes, then set namespace structure for - node and register new ns-prefix mappings. - - This only works for a newly created node! 
- """ - cdef xmlNs* c_ns - cdef list nsdefs - if not nsmap: - if node_ns_utf is not None: - _uriValidOrRaise(node_ns_utf) - doc._setNodeNs(c_node, _xcstr(node_ns_utf)) - return 0 - - nsdefs = list(nsmap.items()) - if None in nsmap and len(nsdefs) > 1: - # Move the default namespace to the end. This makes sure libxml2 - # prefers a prefix if the ns is defined redundantly on the same - # element. That way, users can work around a problem themselves - # where default namespace attributes on non-default namespaced - # elements serialise without prefix (i.e. into the non-default - # namespace). - item = (None, nsmap[None]) - nsdefs.remove(item) - nsdefs.append(item) - - for prefix, href in nsdefs: - href_utf = _utf8(href) - _uriValidOrRaise(href_utf) - c_href = _xcstr(href_utf) - if prefix is not None: - prefix_utf = _utf8(prefix) - _prefixValidOrRaise(prefix_utf) - c_prefix = _xcstr(prefix_utf) - else: - c_prefix = NULL - # add namespace with prefix if it is not already known - c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix) - if c_ns is NULL or \ - c_ns.href is NULL or \ - tree.xmlStrcmp(c_ns.href, c_href) != 0: - c_ns = tree.xmlNewNs(c_node, c_href, c_prefix) - if href_utf == node_ns_utf: - tree.xmlSetNs(c_node, c_ns) - node_ns_utf = None - - if node_ns_utf is not None: - doc._setNodeNs(c_node, _xcstr(node_ns_utf)) - return 0 - -cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra): - u"""Initialise the attributes of an element node. 
- """ - cdef bint is_html - cdef xmlNs* c_ns - if attrib is not None and not hasattr(attrib, u'items'): - raise TypeError, u"Invalid attribute dictionary: %s" % \ - python._fqtypename(attrib).decode('utf8') - if not attrib and not extra: - return # nothing to do - is_html = doc._parser._for_html - seen = set() - if extra: - for name, value in sorted(extra.items()): - _addAttributeToNode(c_node, doc, is_html, name, value, seen) - if attrib: - # attrib will usually be a plain unordered dict - if type(attrib) is dict: - attrib = sorted(attrib.items()) - elif isinstance(attrib, _Attrib) or ( - OrderedDict is not None and isinstance(attrib, OrderedDict)): - attrib = attrib.items() - else: - # assume it's an unordered mapping of some kind - attrib = sorted(attrib.items()) - for name, value in attrib: - _addAttributeToNode(c_node, doc, is_html, name, value, seen) - -cdef int _addAttributeToNode(xmlNode* c_node, _Document doc, bint is_html, - name, value, set seen_tags) except -1: - ns_utf, name_utf = tag = _getNsTag(name) - if tag in seen_tags: - return 0 - seen_tags.add(tag) - if not is_html: - _attributeValidOrRaise(name_utf) - value_utf = _utf8(value) - if ns_utf is None: - tree.xmlNewProp(c_node, _xcstr(name_utf), _xcstr(value_utf)) - else: - _uriValidOrRaise(ns_utf) - c_ns = doc._findOrBuildNodeNs(c_node, _xcstr(ns_utf), NULL, 1) - tree.xmlNewNsProp(c_node, c_ns, - _xcstr(name_utf), _xcstr(value_utf)) - return 0 - -ctypedef struct _ns_node_ref: - xmlNs* ns - xmlNode* node - -cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element) except -1: - u"""Remove any namespace declarations from a subtree that are not used by - any of its elements (or attributes). 
- """ - cdef _ns_node_ref* c_ns_list - cdef _ns_node_ref* c_nsref_ptr - cdef xmlNs* c_nsdef - cdef xmlNode* c_node - cdef size_t c_ns_list_size - cdef size_t c_ns_list_len - cdef size_t i - - c_ns_list = NULL - c_ns_list_size = 0 - c_ns_list_len = 0 - - if c_element.parent is not NULL and \ - c_element.parent.type == tree.XML_DOCUMENT_NODE: - # include the document node - c_nsdef = c_element.parent.nsDef - while c_nsdef is not NULL: - if c_ns_list_len >= c_ns_list_size: - if c_ns_list is NULL: - c_ns_list_size = 20 - else: - c_ns_list_size *= 2 - c_nsref_ptr = <_ns_node_ref*> stdlib.realloc( - c_ns_list, c_ns_list_size * sizeof(_ns_node_ref)) - if c_nsref_ptr is NULL: - if c_ns_list is not NULL: - stdlib.free(c_ns_list) - raise MemoryError() - c_ns_list = c_nsref_ptr - - c_ns_list[c_ns_list_len].ns = c_nsdef - c_ns_list[c_ns_list_len].node = c_element.parent - c_ns_list_len += 1 - c_nsdef = c_nsdef.next - - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1) - # collect all new namespace declarations into the ns list - c_nsdef = c_element.nsDef - while c_nsdef is not NULL: - if c_ns_list_len >= c_ns_list_size: - if c_ns_list is NULL: - c_ns_list_size = 20 - else: - c_ns_list_size *= 2 - c_nsref_ptr = <_ns_node_ref*> stdlib.realloc( - c_ns_list, c_ns_list_size * sizeof(_ns_node_ref)) - if c_nsref_ptr is NULL: - if c_ns_list is not NULL: - stdlib.free(c_ns_list) - raise MemoryError() - c_ns_list = c_nsref_ptr - - c_ns_list[c_ns_list_len].ns = c_nsdef - c_ns_list[c_ns_list_len].node = c_element - c_ns_list_len += 1 - c_nsdef = c_nsdef.next - - # remove all namespace declarations from the list that are referenced - if c_element.type == tree.XML_ELEMENT_NODE: - c_node = c_element - while c_node is not NULL: - if c_node.ns is not NULL: - for i in range(c_ns_list_len): - if c_node.ns is c_ns_list[i].ns: - c_ns_list_len -= 1 - c_ns_list[i].ns = c_ns_list[c_ns_list_len].ns - c_ns_list[i].node = c_ns_list[c_ns_list_len].node - c_ns_list[c_ns_list_len].ns = NULL - 
c_ns_list[c_ns_list_len].node = NULL - break - if c_node is c_element: - # continue with attributes - c_node = c_element.properties - else: - c_node = c_node.next - tree.END_FOR_EACH_ELEMENT_FROM(c_element) - - if c_ns_list is NULL: - return 0 - - # free all namespace declarations that remained in the list - for i in range(c_ns_list_len): - c_node = c_ns_list[i].node - c_nsdef = c_node.nsDef - if c_nsdef is c_ns_list[i].ns: - c_node.nsDef = c_node.nsDef.next - else: - while c_nsdef.next is not c_ns_list[i].ns: - c_nsdef = c_nsdef.next - c_nsdef.next = c_nsdef.next.next - tree.xmlFreeNs(c_ns_list[i].ns) - - if c_ns_list is not NULL: - stdlib.free(c_ns_list) - return 0 - -cdef xmlNs* _searchNsByHref(xmlNode* c_node, const_xmlChar* c_href, bint is_attribute): - u"""Search a namespace declaration that covers a node (element or - attribute). - - For attributes, try to find a prefixed namespace declaration - instead of the default namespaces. This helps in supporting - round-trips for attributes on elements with a different namespace. 
- """ - cdef xmlNs* c_ns - cdef xmlNs* c_default_ns = NULL - cdef xmlNode* c_element - if c_href is NULL or c_node is NULL or c_node.type == tree.XML_ENTITY_REF_NODE: - return NULL - if tree.xmlStrcmp(c_href, tree.XML_XML_NAMESPACE) == 0: - # no special cases here, let libxml2 handle this - return tree.xmlSearchNsByHref(c_node.doc, c_node, c_href) - if c_node.type == tree.XML_ATTRIBUTE_NODE: - is_attribute = 1 - while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE: - c_node = c_node.parent - c_element = c_node - while c_node is not NULL: - if c_node.type == tree.XML_ELEMENT_NODE: - c_ns = c_node.nsDef - while c_ns is not NULL: - if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0: - if c_ns.prefix is NULL and is_attribute: - # for attributes, continue searching a named - # prefix, but keep the first default namespace - # declaration that we found - if c_default_ns is NULL: - c_default_ns = c_ns - elif tree.xmlSearchNs( - c_element.doc, c_element, c_ns.prefix) is c_ns: - # start node is in namespace scope => found! - return c_ns - c_ns = c_ns.next - if c_node is not c_element and c_node.ns is not NULL: - # optimise: the node may have the namespace itself - c_ns = c_node.ns - if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0: - if c_ns.prefix is NULL and is_attribute: - # for attributes, continue searching a named - # prefix, but keep the first default namespace - # declaration that we found - if c_default_ns is NULL: - c_default_ns = c_ns - elif tree.xmlSearchNs( - c_element.doc, c_element, c_ns.prefix) is c_ns: - # start node is in namespace scope => found! 
- return c_ns - c_node = c_node.parent - # nothing found => use a matching default namespace or fail - if c_default_ns is not NULL: - if tree.xmlSearchNs(c_element.doc, c_element, NULL) is c_default_ns: - return c_default_ns - return NULL - -cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1: - # NOTE: this does not deallocate the node, just unlink it! - cdef xmlNode* c_parent - cdef xmlNode* c_child - if c_node.children is NULL: - tree.xmlUnlinkNode(c_node) - return 0 - - c_parent = c_node.parent - # fix parent links of children - c_child = c_node.children - while c_child is not NULL: - c_child.parent = c_parent - c_child = c_child.next - - # fix namespace references of children if their parent's namespace - # declarations get lost - if c_node.nsDef is not NULL: - c_child = c_node.children - while c_child is not NULL: - moveNodeToDocument(doc, doc._c_doc, c_child) - c_child = c_child.next - - # fix sibling links to/from child slice - if c_node.prev is NULL: - c_parent.children = c_node.children - else: - c_node.prev.next = c_node.children - c_node.children.prev = c_node.prev - if c_node.next is NULL: - c_parent.last = c_node.last - else: - c_node.next.prev = c_node.last - c_node.last.next = c_node.next - - # unlink c_node - c_node.children = c_node.last = NULL - c_node.parent = c_node.next = c_node.prev = NULL - return 0 - -cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node): - c_href = _getNs(c_attrib_node) - value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href) - try: - result = funicode(value) - finally: - tree.xmlFree(value) - return result - -cdef object _attributeValueFromNsName(xmlNode* c_element, - const_xmlChar* c_href, const_xmlChar* c_name): - c_result = tree.xmlGetNsProp(c_element, c_name, c_href) - if c_result is NULL: - return None - try: - result = funicode(c_result) - finally: - tree.xmlFree(c_result) - return result - -cdef object _getNodeAttributeValue(xmlNode* c_node, key, default): - ns, tag 
= _getNsTag(key) - c_href = NULL if ns is None else _xcstr(ns) - c_result = tree.xmlGetNsProp(c_node, _xcstr(tag), c_href) - if c_result is NULL: - # XXX free namespace that is not in use..? - return default - try: - result = funicode(c_result) - finally: - tree.xmlFree(c_result) - return result - -cdef inline object _getAttributeValue(_Element element, key, default): - return _getNodeAttributeValue(element._c_node, key, default) - -cdef int _setAttributeValue(_Element element, key, value) except -1: - cdef xmlNs* c_ns - ns, tag = _getNsTag(key) - if not element._doc._parser._for_html: - _attributeValidOrRaise(tag) - c_tag = _xcstr(tag) - if isinstance(value, QName): - value = _resolveQNameText(element, value) - else: - value = _utf8(value) - c_value = _xcstr(value) - if ns is None: - c_ns = NULL - else: - c_ns = element._doc._findOrBuildNodeNs(element._c_node, _xcstr(ns), NULL, 1) - tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value) - return 0 - -cdef int _delAttribute(_Element element, key) except -1: - ns, tag = _getNsTag(key) - c_href = NULL if ns is None else _xcstr(ns) - if _delAttributeFromNsName(element._c_node, c_href, _xcstr(tag)): - raise KeyError, key - return 0 - -cdef int _delAttributeFromNsName(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name): - c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) - if c_attr is NULL: - # XXX free namespace that is not in use..? - return -1 - tree.xmlRemoveProp(c_attr) - return 0 - -cdef list _collectAttributes(xmlNode* c_node, int collecttype): - u"""Collect all attributes of a node in a list. Depending on collecttype, - it collects either the name (1), the value (2) or the name-value tuples. 
- """ - cdef Py_ssize_t count - c_attr = c_node.properties - count = 0 - while c_attr is not NULL: - if c_attr.type == tree.XML_ATTRIBUTE_NODE: - count += 1 - c_attr = c_attr.next - - if not count: - return [] - - attributes = [None] * count - c_attr = c_node.properties - count = 0 - while c_attr is not NULL: - if c_attr.type == tree.XML_ATTRIBUTE_NODE: - if collecttype == 1: - item = _namespacedName(c_attr) - elif collecttype == 2: - item = _attributeValue(c_node, c_attr) - else: - item = (_namespacedName(c_attr), - _attributeValue(c_node, c_attr)) - attributes[count] = item - count += 1 - c_attr = c_attr.next - return attributes - -cdef object __RE_XML_ENCODING = re.compile( - ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) - -cdef object __REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub -cdef object __HAS_XML_ENCODING = __RE_XML_ENCODING.match - -cdef object _stripEncodingDeclaration(object xml_string): - # this is a hack to remove the XML encoding declaration from unicode - return __REPLACE_XML_ENCODING(ur'\g<1>\g<2>', xml_string) - -cdef bint _hasEncodingDeclaration(object xml_string) except -1: - # check if a (unicode) string has an XML encoding declaration - return __HAS_XML_ENCODING(xml_string) is not None - -cdef inline bint _hasText(xmlNode* c_node): - return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL - -cdef inline bint _hasTail(xmlNode* c_node): - return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL - -cdef _collectText(xmlNode* c_node): - u"""Collect all text nodes and return them as a unicode string. - - Start collecting at c_node. 
- - If there was no text to collect, return None - """ - cdef Py_ssize_t scount - cdef xmlChar* c_text - cdef xmlNode* c_node_cur - # check for multiple text nodes - scount = 0 - c_text = NULL - c_node_cur = c_node = _textNodeOrSkip(c_node) - while c_node_cur is not NULL: - if c_node_cur.content[0] != c'\0': - c_text = c_node_cur.content - scount += 1 - c_node_cur = _textNodeOrSkip(c_node_cur.next) - - # handle two most common cases first - if c_text is NULL: - return '' if scount > 0 else None - if scount == 1: - return funicode(c_text) - - # the rest is not performance critical anymore - result = b'' - while c_node is not NULL: - result += c_node.content - c_node = _textNodeOrSkip(c_node.next) - return funicode(result) - -cdef void _removeText(xmlNode* c_node): - u"""Remove all text nodes. - - Start removing at c_node. - """ - cdef xmlNode* c_next - c_node = _textNodeOrSkip(c_node) - while c_node is not NULL: - c_next = _textNodeOrSkip(c_node.next) - tree.xmlUnlinkNode(c_node) - tree.xmlFreeNode(c_node) - c_node = c_next - -cdef int _setNodeText(xmlNode* c_node, value) except -1: - cdef xmlNode* c_text_node - # remove all text nodes at the start first - _removeText(c_node.children) - if value is None: - return 0 - # now add new text node with value at start - if python._isString(value): - text = _utf8(value) - c_text_node = tree.xmlNewDocText(c_node.doc, _xcstr(text)) - elif isinstance(value, CDATA): - c_text_node = tree.xmlNewCDataBlock( - c_node.doc, _xcstr((value)._utf8_data), - python.PyBytes_GET_SIZE((value)._utf8_data)) - else: - # this will raise the right error - _utf8(value) - return -1 - if c_node.children is NULL: - tree.xmlAddChild(c_node, c_text_node) - else: - tree.xmlAddPrevSibling(c_node.children, c_text_node) - return 0 - -cdef int _setTailText(xmlNode* c_node, value) except -1: - cdef xmlNode* c_text_node - # remove all text nodes at the start first - _removeText(c_node.next) - if value is None: - return 0 - text = _utf8(value) - c_text_node = 
tree.xmlNewDocText(c_node.doc, _xcstr(text)) - # XXX what if we're the top element? - tree.xmlAddNextSibling(c_node, c_text_node) - return 0 - -cdef bytes _resolveQNameText(_Element element, value): - cdef xmlNs* c_ns - ns, tag = _getNsTag(value) - if ns is None: - return tag - else: - c_ns = element._doc._findOrBuildNodeNs( - element._c_node, _xcstr(ns), NULL, 0) - return python.PyBytes_FromFormat('%s:%s', c_ns.prefix, _cstr(tag)) - -cdef inline bint _hasChild(xmlNode* c_node): - return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL - -cdef inline Py_ssize_t _countElements(xmlNode* c_node): - u"Counts the elements within the following siblings and the node itself." - cdef Py_ssize_t count - count = 0 - while c_node is not NULL: - if _isElement(c_node): - count += 1 - c_node = c_node.next - return count - -cdef int _findChildSlice( - slice sliceobject, xmlNode* c_parent, - xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1: - u"""Resolve a children slice. - - Returns the start node, step size and the slice length in the - pointer arguments. - """ - cdef Py_ssize_t start = 0, stop = 0, childcount - childcount = _countElements(c_parent.children) - if childcount == 0: - c_start_node[0] = NULL - c_length[0] = 0 - if sliceobject.step is None: - c_step[0] = 1 - else: - python._PyEval_SliceIndex(sliceobject.step, c_step) - return 0 - python.PySlice_GetIndicesEx( - sliceobject, childcount, &start, &stop, c_step, c_length) - if start > childcount / 2: - c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1) - else: - c_start_node[0] = _findChild(c_parent, start) - return 0 - -cdef bint _isFullSlice(slice sliceobject) except -1: - u"""Conservative guess if this slice is a full slice as in ``s[:]``. 
- """ - cdef Py_ssize_t step = 0 - if sliceobject is None: - return 0 - if sliceobject.start is None and \ - sliceobject.stop is None: - if sliceobject.step is None: - return 1 - python._PyEval_SliceIndex(sliceobject.step, &step) - if step == 1: - return 1 - return 0 - return 0 - -cdef _collectChildren(_Element element): - cdef xmlNode* c_node - cdef list result = [] - c_node = element._c_node.children - if c_node is not NULL: - if not _isElement(c_node): - c_node = _nextElement(c_node) - while c_node is not NULL: - result.append(_elementFactory(element._doc, c_node)) - c_node = _nextElement(c_node) - return result - -cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): - if index < 0: - return _findChildBackwards(c_node, -index - 1) - else: - return _findChildForwards(c_node, index) - -cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index): - u"""Return child element of c_node with index, or return NULL if not found. - """ - cdef xmlNode* c_child - cdef Py_ssize_t c - c_child = c_node.children - c = 0 - while c_child is not NULL: - if _isElement(c_child): - if c == index: - return c_child - c += 1 - c_child = c_child.next - return NULL - -cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index): - u"""Return child element of c_node with index, or return NULL if not found. - Search from the end. - """ - cdef xmlNode* c_child - cdef Py_ssize_t c - c_child = c_node.last - c = 0 - while c_child is not NULL: - if _isElement(c_child): - if c == index: - return c_child - c += 1 - c_child = c_child.prev - return NULL - -cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil: - u"""Return the node if it's a text node. Skip over ignorable nodes in a - series of text nodes. Return NULL if a non-ignorable node is found. - - This is used to skip over XInclude nodes when collecting adjacent text - nodes. 
- """ - while c_node is not NULL: - if c_node.type == tree.XML_TEXT_NODE or \ - c_node.type == tree.XML_CDATA_SECTION_NODE: - return c_node - elif c_node.type == tree.XML_XINCLUDE_START or \ - c_node.type == tree.XML_XINCLUDE_END: - c_node = c_node.next - else: - return NULL - return NULL - -cdef inline xmlNode* _nextElement(xmlNode* c_node): - u"""Given a node, find the next sibling that is an element. - """ - if c_node is NULL: - return NULL - c_node = c_node.next - while c_node is not NULL: - if _isElement(c_node): - return c_node - c_node = c_node.next - return NULL - -cdef inline xmlNode* _previousElement(xmlNode* c_node): - u"""Given a node, find the next sibling that is an element. - """ - if c_node is NULL: - return NULL - c_node = c_node.prev - while c_node is not NULL: - if _isElement(c_node): - return c_node - c_node = c_node.prev - return NULL - -cdef inline xmlNode* _parentElement(xmlNode* c_node): - u"Given a node, find the parent element." - if c_node is NULL or not _isElement(c_node): - return NULL - c_node = c_node.parent - if c_node is NULL or not _isElement(c_node): - return NULL - return c_node - -cdef inline bint _tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name): - u"""Tests if the node matches namespace URI and tag name. - - A node matches if it matches both c_href and c_name. 
- - A node matches c_href if any of the following is true: - * c_href is NULL - * its namespace is NULL and c_href is the empty string - * its namespace string equals the c_href string - - A node matches c_name if any of the following is true: - * c_name is NULL - * its name string equals the c_name string - """ - if c_node is NULL: - return 0 - if c_node.type != tree.XML_ELEMENT_NODE: - # not an element, only succeed if we match everything - return c_name is NULL and c_href is NULL - if c_name is NULL: - if c_href is NULL: - # always match - return 1 - else: - c_node_href = _getNs(c_node) - if c_node_href is NULL: - return c_href[0] == c'\0' - else: - return tree.xmlStrcmp(c_node_href, c_href) == 0 - elif c_href is NULL: - if _getNs(c_node) is not NULL: - return 0 - return c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0 - elif c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0: - c_node_href = _getNs(c_node) - if c_node_href is NULL: - return c_href[0] == c'\0' - else: - return tree.xmlStrcmp(c_node_href, c_href) == 0 - else: - return 0 - -cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname): - u"""Tests if the node matches namespace URI and tag name. - - This differs from _tagMatches() in that it does not consider a - NULL value in qname.href a wildcard, and that it expects the c_name - to be taken from the doc dict, i.e. it only compares the names by - address. - - A node matches if it matches both href and c_name of the qname. - - A node matches c_href if any of the following is true: - * its namespace is NULL and c_href is the empty string - * its namespace string equals the c_href string - - A node matches c_name if any of the following is true: - * c_name is NULL - * its name string points to the same address (!) 
as c_name - """ - return _nsTagMatchesExactly(_getNs(c_node), c_node.name, c_qname) - -cdef inline bint _nsTagMatchesExactly(const_xmlChar* c_node_href, - const_xmlChar* c_node_name, - qname* c_qname): - u"""Tests if name and namespace URI match those of c_qname. - - This differs from _tagMatches() in that it does not consider a - NULL value in qname.href a wildcard, and that it expects the c_name - to be taken from the doc dict, i.e. it only compares the names by - address. - - A node matches if it matches both href and c_name of the qname. - - A node matches c_href if any of the following is true: - * its namespace is NULL and c_href is the empty string - * its namespace string equals the c_href string - - A node matches c_name if any of the following is true: - * c_name is NULL - * its name string points to the same address (!) as c_name - """ - cdef char* c_href - if c_qname.c_name is not NULL and c_qname.c_name is not c_node_name: - return 0 - if c_qname.href is NULL: - return 1 - c_href = python.__cstr(c_qname.href) - if c_href[0] == '\0': - return c_node_href is NULL or c_node_href[0] == '\0' - elif c_node_href is NULL: - return 0 - else: - return tree.xmlStrcmp(c_href, c_node_href) == 0 - -cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags, - qname* c_ns_tags, bint force_into_dict) except -1: - u"""Map a sequence of (name, namespace) pairs to a qname array for efficient - matching with _tagMatchesExactly() above. - - Note that each qname struct in the array owns its href byte string object - if it is not NULL. 
- """ - cdef Py_ssize_t count = 0, i - cdef bytes ns, tag - for ns, tag in ns_tags: - if tag is None: - c_tag = NULL - elif force_into_dict: - c_tag = tree.xmlDictLookup(c_doc.dict, _xcstr(tag), len(tag)) - if c_tag is NULL: - # clean up before raising the error - for i in xrange(count): - cpython.ref.Py_XDECREF(c_ns_tags[i].href) - raise MemoryError() - else: - c_tag = tree.xmlDictExists(c_doc.dict, _xcstr(tag), len(tag)) - if c_tag is NULL: - # not in the dict => not in the document - continue - c_ns_tags[count].c_name = c_tag - if ns is None: - c_ns_tags[count].href = NULL - else: - cpython.ref.Py_INCREF(ns) # keep an owned reference! - c_ns_tags[count].href = ns - count += 1 - return count - -cdef int _removeNode(_Document doc, xmlNode* c_node) except -1: - u"""Unlink and free a node and subnodes if possible. Otherwise, make sure - it's self-contained. - """ - cdef xmlNode* c_next - c_next = c_node.next - tree.xmlUnlinkNode(c_node) - _moveTail(c_next, c_node) - if not attemptDeallocation(c_node): - # make namespaces absolute - moveNodeToDocument(doc, c_node.doc, c_node) - return 0 - -cdef int _removeSiblings(xmlNode* c_element, tree.xmlElementType node_type, bint with_tail) except -1: - cdef xmlNode* c_node - cdef xmlNode* c_next - c_node = c_element.next - while c_node is not NULL: - c_next = _nextElement(c_node) - if c_node.type == node_type: - if with_tail: - _removeText(c_node.next) - tree.xmlUnlinkNode(c_node) - attemptDeallocation(c_node) - c_node = c_next - c_node = c_element.prev - while c_node is not NULL: - c_next = _previousElement(c_node) - if c_node.type == node_type: - if with_tail: - _removeText(c_node.next) - tree.xmlUnlinkNode(c_node) - attemptDeallocation(c_node) - c_node = c_next - return 0 - -cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): - cdef xmlNode* c_next - # tail support: look for any text nodes trailing this node and - # move them too - c_tail = _textNodeOrSkip(c_tail) - while c_tail is not NULL: - c_next = 
_textNodeOrSkip(c_tail.next) - c_target = tree.xmlAddNextSibling(c_target, c_tail) - c_tail = c_next - -cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1: - cdef xmlNode* c_new_tail - # tail copying support: look for any text nodes trailing this node and - # copy it to the target node - c_tail = _textNodeOrSkip(c_tail) - while c_tail is not NULL: - if c_target.doc is not c_tail.doc: - c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0) - else: - c_new_tail = tree.xmlCopyNode(c_tail, 0) - if c_new_tail is NULL: - raise MemoryError() - c_target = tree.xmlAddNextSibling(c_target, c_new_tail) - c_tail = _textNodeOrSkip(c_tail.next) - return 0 - -cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1: - cdef xmlNode* c_copy - cdef xmlNode* c_sibling = c_node - while c_sibling.prev != NULL and \ - (c_sibling.prev.type == tree.XML_PI_NODE or \ - c_sibling.prev.type == tree.XML_COMMENT_NODE): - c_sibling = c_sibling.prev - while c_sibling != c_node: - c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) - if c_copy is NULL: - raise MemoryError() - tree.xmlAddPrevSibling(c_target, c_copy) - c_sibling = c_sibling.next - while c_sibling.next != NULL and \ - (c_sibling.next.type == tree.XML_PI_NODE or \ - c_sibling.next.type == tree.XML_COMMENT_NODE): - c_sibling = c_sibling.next - c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) - if c_copy is NULL: - raise MemoryError() - tree.xmlAddNextSibling(c_target, c_copy) - -cdef int _deleteSlice(_Document doc, xmlNode* c_node, - Py_ssize_t count, Py_ssize_t step) except -1: - u"""Delete slice, ``count`` items starting with ``c_node`` with a step - width of ``step``. 
- """ - cdef xmlNode* c_next - cdef Py_ssize_t c, i - cdef _node_to_node_function next_element - if c_node is NULL: - return 0 - if step > 0: - next_element = _nextElement - else: - step = -step - next_element = _previousElement - # now start deleting nodes - c = 0 - c_next = c_node - while c_node is not NULL and c < count: - for i in range(step): - c_next = next_element(c_next) - _removeNode(doc, c_node) - c += 1 - c_node = c_next - return 0 - -cdef int _replaceSlice(_Element parent, xmlNode* c_node, - Py_ssize_t slicelength, Py_ssize_t step, - bint left_to_right, elements) except -1: - u"""Replace the slice of ``count`` elements starting at ``c_node`` with - positive step width ``step`` by the Elements in ``elements``. The - direction is given by the boolean argument ``left_to_right``. - - ``c_node`` may be NULL to indicate the end of the children list. - """ - cdef xmlNode* c_orig_neighbour - cdef xmlNode* c_next - cdef xmlDoc* c_source_doc - cdef _Element element - cdef Py_ssize_t seqlength, i, c - cdef _node_to_node_function next_element - assert step > 0 - if left_to_right: - next_element = _nextElement - else: - next_element = _previousElement - - if not isinstance(elements, (list, tuple)): - elements = list(elements) - - if step > 1: - # *replacing* children stepwise with list => check size! 
- seqlength = len(elements) - if seqlength != slicelength: - raise ValueError, u"attempt to assign sequence of size %d " \ - u"to extended slice of size %d" % (seqlength, slicelength) - - if c_node is NULL: - # no children yet => add all elements straight away - if left_to_right: - for element in elements: - assert element is not None, u"Node must not be None" - _appendChild(parent, element) - else: - for element in elements: - assert element is not None, u"Node must not be None" - _prependChild(parent, element) - return 0 - - # remove the elements first as some might be re-added - if left_to_right: - # L->R, remember left neighbour - c_orig_neighbour = _previousElement(c_node) - else: - # R->L, remember right neighbour - c_orig_neighbour = _nextElement(c_node) - - # We remove the original slice elements one by one. Since we hold - # a Python reference to all elements that we will insert, it is - # safe to let _removeNode() try (and fail) to free them even if - # the element itself or one of its descendents will be reinserted. 
- c = 0 - c_next = c_node - while c_node is not NULL and c < slicelength: - for i in range(step): - c_next = next_element(c_next) - _removeNode(parent._doc, c_node) - c += 1 - c_node = c_next - - # make sure each element is inserted only once - elements = iter(elements) - - # find the first node right of the new insertion point - if left_to_right: - if c_orig_neighbour is not NULL: - c_node = next_element(c_orig_neighbour) - else: - # before the first element - c_node = _findChildForwards(parent._c_node, 0) - elif c_orig_neighbour is NULL: - # at the end, but reversed stepping - # append one element and go to the next insertion point - for element in elements: - assert element is not None, u"Node must not be None" - _appendChild(parent, element) - c_node = element._c_node - if slicelength > 0: - slicelength -= 1 - for i in range(1, step): - c_node = next_element(c_node) - break - - if left_to_right: - # adjust step size after removing slice as we are not stepping - # over the newly inserted elements - step -= 1 - - # now insert elements where we removed them - if c_node is not NULL: - for element in elements: - assert element is not None, u"Node must not be None" - _assertValidNode(element) - # move element and tail over - c_source_doc = element._c_node.doc - c_next = element._c_node.next - tree.xmlAddPrevSibling(c_node, element._c_node) - _moveTail(c_next, element._c_node) - - # integrate element into new document - moveNodeToDocument(parent._doc, c_source_doc, element._c_node) - - # stop at the end of the slice - if slicelength > 0: - slicelength = slicelength - 1 - for i in range(step): - c_node = next_element(c_node) - if c_node is NULL: - break - else: - # everything inserted - return 0 - - # append the remaining elements at the respective end - if left_to_right: - for element in elements: - assert element is not None, u"Node must not be None" - _assertValidNode(element) - _appendChild(parent, element) - else: - for element in elements: - assert element is not 
None, u"Node must not be None" - _assertValidNode(element) - _prependChild(parent, element) - - return 0 - -cdef int _appendChild(_Element parent, _Element child) except -1: - u"""Append a new child to a parent element. - """ - c_node = child._c_node - c_source_doc = c_node.doc - # prevent cycles - c_parent = parent._c_node - while c_parent: - if c_parent is c_node: - raise ValueError("cannot append parent to itself") - c_parent = c_parent.parent - # store possible text node - c_next = c_node.next - # move node itself - tree.xmlUnlinkNode(c_node) - tree.xmlAddChild(parent._c_node, c_node) - _moveTail(c_next, c_node) - # uh oh, elements may be pointing to different doc when - # parent element has moved; change them too.. - moveNodeToDocument(parent._doc, c_source_doc, c_node) - return 0 - -cdef int _prependChild(_Element parent, _Element child) except -1: - u"""Prepend a new child to a parent element. - """ - c_node = child._c_node - c_source_doc = c_node.doc - # prevent cycles - c_parent = parent._c_node - while c_parent: - if c_parent is c_node: - raise ValueError("cannot append parent to itself") - c_parent = c_parent.parent - # store possible text node - c_next = c_node.next - # move node itself - c_child = _findChildForwards(parent._c_node, 0) - if c_child is NULL: - tree.xmlUnlinkNode(c_node) - tree.xmlAddChild(parent._c_node, c_node) - else: - tree.xmlAddPrevSibling(c_child, c_node) - _moveTail(c_next, c_node) - # uh oh, elements may be pointing to different doc when - # parent element has moved; change them too.. - moveNodeToDocument(parent._doc, c_source_doc, c_node) - return 0 - -cdef int _appendSibling(_Element element, _Element sibling) except -1: - u"""Add a new sibling behind an element. 
- """ - c_node = sibling._c_node - if element._c_node is c_node: - return 0 # nothing to do - c_source_doc = c_node.doc - # store possible text node - c_next = c_node.next - # move node itself - tree.xmlAddNextSibling(element._c_node, c_node) - _moveTail(c_next, c_node) - # uh oh, elements may be pointing to different doc when - # parent element has moved; change them too.. - moveNodeToDocument(element._doc, c_source_doc, c_node) - return 0 - -cdef int _prependSibling(_Element element, _Element sibling) except -1: - u"""Add a new sibling before an element. - """ - c_node = sibling._c_node - if element._c_node is c_node: - return 0 # nothing to do - c_source_doc = c_node.doc - # store possible text node - c_next = c_node.next - # move node itself - tree.xmlAddPrevSibling(element._c_node, c_node) - _moveTail(c_next, c_node) - # uh oh, elements may be pointing to different doc when - # parent element has moved; change them too.. - moveNodeToDocument(element._doc, c_source_doc, c_node) - return 0 - -cdef inline int isutf8(const_xmlChar* s): - cdef xmlChar c = s[0] - while c != c'\0': - if c & 0x80: - return 1 - s += 1 - c = s[0] - return 0 - -cdef int check_string_utf8(bytes pystring): - u"""Check if a string looks like valid UTF-8 XML content. Returns 0 - for ASCII, 1 for UTF-8 and -1 in the case of errors, such as NULL - bytes or ASCII control characters. - """ - cdef const_xmlChar* s = _xcstr(pystring) - cdef const_xmlChar* c_end = s + len(pystring) - cdef bint is_non_ascii = 0 - while s < c_end: - if s[0] & 0x80: - # skip over multi byte sequences - while s < c_end and s[0] & 0x80: - s += 1 - is_non_ascii = 1 - if s < c_end and not tree.xmlIsChar_ch(s[0]): - return -1 # invalid! 
- s += 1 - return is_non_ascii - -cdef inline object funicodeOrNone(const_xmlChar* s): - return funicode(s) if s is not NULL else None - -cdef inline object funicodeOrEmpty(const_xmlChar* s): - return funicode(s) if s is not NULL else '' - -cdef object funicode(const_xmlChar* s): - cdef Py_ssize_t slen - cdef const_xmlChar* spos - cdef bint is_non_ascii - if python.LXML_UNICODE_STRINGS: - return s.decode('UTF-8') - spos = s - is_non_ascii = 0 - while spos[0] != c'\0': - if spos[0] & 0x80: - is_non_ascii = 1 - break - spos += 1 - slen = spos - s - if spos[0] != c'\0': - slen += tree.xmlStrlen(spos) - if is_non_ascii: - return s[:slen].decode('UTF-8') - return s[:slen] - -cdef bytes _utf8(object s): - """Test if a string is valid user input and encode it to UTF-8. - Reject all bytes/unicode input that contains non-XML characters. - Reject all bytes input that contains non-ASCII characters. - """ - cdef int invalid - cdef bytes utf8_string - if not python.IS_PYTHON3 and type(s) is bytes: - utf8_string = s - invalid = check_string_utf8(utf8_string) - elif isinstance(s, unicode): - utf8_string = (s).encode('utf8') - invalid = check_string_utf8(utf8_string) == -1 # non-XML? 
- elif isinstance(s, (bytes, bytearray)): - utf8_string = bytes(s) - invalid = check_string_utf8(utf8_string) - else: - raise TypeError("Argument must be bytes or unicode, got '%.200s'" % type(s).__name__) - if invalid: - raise ValueError( - "All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters") - return utf8_string - -cdef bytes _utf8orNone(object s): - return _utf8(s) if s is not None else None - -cdef bint _isFilePath(const_xmlChar* c_path): - u"simple heuristic to see if a path is a filename" - cdef xmlChar c - # test if it looks like an absolute Unix path or a Windows network path - if c_path[0] == c'/': - return 1 - - # test if it looks like an absolute Windows path or URL - if (c_path[0] >= c'a' and c_path[0] <= c'z') or \ - (c_path[0] >= c'A' and c_path[0] <= c'Z'): - c_path += 1 - if c_path[0] == c':' and c_path[1] in b'\0\\': - return 1 # C: or C:\... - - # test if it looks like a URL with scheme:// - while (c_path[0] >= c'a' and c_path[0] <= c'z') or \ - (c_path[0] >= c'A' and c_path[0] <= c'Z'): - c_path += 1 - if c_path[0] == c':' and c_path[1] == c'/' and c_path[2] == c'/': - return 0 - - # assume it's a relative path - return 1 - -cdef object _encodeFilename(object filename): - u"""Make sure a filename is 8-bit encoded (or None). - """ - if filename is None: - return None - elif isinstance(filename, bytes): - return filename - elif isinstance(filename, unicode): - filename8 = (filename).encode('utf8') - if _isFilePath(filename8): - try: - return python.PyUnicode_AsEncodedString( - filename, _C_FILENAME_ENCODING, NULL) - except UnicodeEncodeError: - pass - return filename8 - else: - raise TypeError("Argument must be string or unicode.") - -cdef object _decodeFilename(const_xmlChar* c_path): - u"""Make the filename a unicode string if we are in Py3. 
- """ - return _decodeFilenameWithLength(c_path, tree.xmlStrlen(c_path)) - -cdef object _decodeFilenameWithLength(const_xmlChar* c_path, size_t c_len): - u"""Make the filename a unicode string if we are in Py3. - """ - if _isFilePath(c_path): - try: - return python.PyUnicode_Decode( - c_path, c_len, _C_FILENAME_ENCODING, NULL) - except UnicodeDecodeError: - pass - try: - return (c_path)[:c_len].decode('UTF-8') - except UnicodeDecodeError: - # this is a stupid fallback, but it might still work... - return (c_path)[:c_len].decode('latin-1', 'replace') - -cdef object _encodeFilenameUTF8(object filename): - u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and - UTF-8 as source encoding. - """ - cdef char* c_filename - if filename is None: - return None - elif isinstance(filename, bytes): - if not check_string_utf8(filename): - # plain ASCII! - return filename - c_filename = _cstr(filename) - try: - # try to decode with default encoding - filename = python.PyUnicode_Decode( - c_filename, len(filename), - _C_FILENAME_ENCODING, NULL) - except UnicodeDecodeError as decode_exc: - try: - # try if it's proper UTF-8 - (filename).decode('utf8') - return filename - except UnicodeDecodeError: - raise decode_exc # otherwise re-raise original exception - if isinstance(filename, unicode): - return (filename).encode('utf8') - else: - raise TypeError("Argument must be string or unicode.") - -cdef tuple _getNsTag(tag): - u"""Given a tag, find namespace URI and tag name. - Return None for NS uri if no namespace URI provided. - """ - return __getNsTag(tag, 0) - -cdef tuple _getNsTagWithEmptyNs(tag): - u"""Given a tag, find namespace URI and tag name. Return None for NS uri - if no namespace URI provided, or the empty string if namespace - part is '{}'. 
- """ - return __getNsTag(tag, 1) - -cdef tuple __getNsTag(tag, bint empty_ns): - cdef char* c_tag - cdef char* c_ns_end - cdef Py_ssize_t taglen - cdef Py_ssize_t nslen - cdef bytes ns = None - # _isString() is much faster than isinstance() - if not _isString(tag) and isinstance(tag, QName): - tag = (tag).text - tag = _utf8(tag) - c_tag = _cstr(tag) - if c_tag[0] == c'{': - c_tag += 1 - c_ns_end = cstring_h.strchr(c_tag, c'}') - if c_ns_end is NULL: - raise ValueError, u"Invalid tag name" - nslen = c_ns_end - c_tag - taglen = python.PyBytes_GET_SIZE(tag) - nslen - 2 - if taglen == 0: - raise ValueError, u"Empty tag name" - if nslen > 0: - ns = c_tag[:nslen] - elif empty_ns: - ns = b'' - tag = c_ns_end[1:taglen+1] - elif python.PyBytes_GET_SIZE(tag) == 0: - raise ValueError, u"Empty tag name" - return ns, tag - -cdef inline int _pyXmlNameIsValid(name_utf8): - return _xmlNameIsValid(_xcstr(name_utf8)) - -cdef inline int _pyHtmlNameIsValid(name_utf8): - return _htmlNameIsValid(_xcstr(name_utf8)) - -cdef inline int _xmlNameIsValid(const_xmlChar* c_name): - return tree.xmlValidateNCName(c_name, 0) == 0 - -cdef int _htmlNameIsValid(const_xmlChar* c_name): - if c_name is NULL or c_name[0] == c'\0': - return 0 - while c_name[0] != c'\0': - if c_name[0] in b'&<>/"\'\t\n\x0B\x0C\r ': - return 0 - c_name += 1 - return 1 - -cdef bint _characterReferenceIsValid(const_xmlChar* c_name): - cdef bint is_hex - if c_name[0] == c'x': - c_name += 1 - is_hex = 1 - else: - is_hex = 0 - if c_name[0] == c'\0': - return 0 - while c_name[0] != c'\0': - if c_name[0] < c'0' or c_name[0] > c'9': - if not is_hex: - return 0 - if not (c'a' <= c_name[0] <= c'f'): - if not (c'A' <= c_name[0] <= c'F'): - return 0 - c_name += 1 - return 1 - -cdef int _tagValidOrRaise(tag_utf) except -1: - if not _pyXmlNameIsValid(tag_utf): - raise ValueError(u"Invalid tag name %r" % - (tag_utf).decode('utf8')) - return 0 - -cdef int _htmlTagValidOrRaise(tag_utf) except -1: - if not _pyHtmlNameIsValid(tag_utf): - 
raise ValueError(u"Invalid HTML tag name %r" % - (tag_utf).decode('utf8')) - return 0 - -cdef int _attributeValidOrRaise(name_utf) except -1: - if not _pyXmlNameIsValid(name_utf): - raise ValueError(u"Invalid attribute name %r" % - (name_utf).decode('utf8')) - return 0 - -cdef int _prefixValidOrRaise(tag_utf) except -1: - if not _pyXmlNameIsValid(tag_utf): - raise ValueError(u"Invalid namespace prefix %r" % - (tag_utf).decode('utf8')) - return 0 - -cdef int _uriValidOrRaise(uri_utf) except -1: - cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf)) - if c_uri is NULL: - raise ValueError(u"Invalid namespace URI %r" % - (uri_utf).decode('utf8')) - uri.xmlFreeURI(c_uri) - return 0 - -cdef inline object _namespacedName(xmlNode* c_node): - return _namespacedNameFromNsName(_getNs(c_node), c_node.name) - -cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name): - if href is NULL: - return funicode(name) - elif python.LXML_UNICODE_STRINGS and not python.IS_PYPY: - return python.PyUnicode_FromFormat("{%s}%s", href, name) - else: - s = python.PyBytes_FromFormat("{%s}%s", href, name) - if python.LXML_UNICODE_STRINGS or isutf8(_xcstr(s)): - return (s).decode('utf8') - else: - return s - -cdef _getFilenameForFile(source): - u"""Given a Python File or Gzip object, give filename back. - - Returns None if not a file object. 
- """ - # urllib2 provides a geturl() method - try: - return source.geturl() - except: - pass - # file instances have a name attribute - try: - filename = source.name - if _isString(filename): - return os_path_abspath(filename) - except: - pass - # gzip file instances have a filename attribute (before Py3k) - try: - filename = source.filename - if _isString(filename): - return os_path_abspath(filename) - except: - pass - # can't determine filename - return None diff --git a/lib/lxml/builder.py b/lib/lxml/builder.py deleted file mode 100644 index ad61a80e..00000000 --- a/lib/lxml/builder.py +++ /dev/null @@ -1,238 +0,0 @@ -# -# Element generator factory by Fredrik Lundh. -# -# Source: -# http://online.effbot.org/2006_11_01_archive.htm#et-builder -# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py -# -# -------------------------------------------------------------------- -# The ElementTree toolkit is -# -# Copyright (c) 1999-2004 by Fredrik Lundh -# -# By obtaining, using, and/or copying this software and/or its -# associated documentation, you agree that you have read, understood, -# and will comply with the following terms and conditions: -# -# Permission to use, copy, modify, and distribute this software and -# its associated documentation for any purpose and without fee is -# hereby granted, provided that the above copyright notice appears in -# all copies, and that both that copyright notice and this permission -# notice appear in supporting documentation, and that the name of -# Secret Labs AB or the author not be used in advertising or publicity -# pertaining to distribution of the software without specific, written -# prior permission. -# -# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD -# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- -# ABILITY AND FITNESS. 
IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR -# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY -# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -# OF THIS SOFTWARE. -# -------------------------------------------------------------------- - -""" -The ``E`` Element factory for generating XML documents. -""" - -import lxml.etree as ET - -try: - from functools import partial -except ImportError: - # fake it for pre-2.5 releases - def partial(func, tag): - return lambda *args, **kwargs: func(tag, *args, **kwargs) - -try: - callable -except NameError: - # Python 3 - def callable(f): - return hasattr(f, '__call__') - -try: - basestring -except NameError: - basestring = str - -try: - unicode -except NameError: - unicode = str - - -class ElementMaker(object): - """Element generator factory. - - Unlike the ordinary Element factory, the E factory allows you to pass in - more than just a tag and some optional attributes; you can also pass in - text and other elements. The text is added as either text or tail - attributes, and elements are inserted at the right spot. 
Some small - examples:: - - >>> from lxml import etree as ET - >>> from lxml.builder import E - - >>> ET.tostring(E("tag")) - '' - >>> ET.tostring(E("tag", "text")) - 'text' - >>> ET.tostring(E("tag", "text", key="value")) - 'text' - >>> ET.tostring(E("tag", E("subtag", "text"), "tail")) - 'texttail' - - For simple tags, the factory also allows you to write ``E.tag(...)`` instead - of ``E('tag', ...)``:: - - >>> ET.tostring(E.tag()) - '' - >>> ET.tostring(E.tag("text")) - 'text' - >>> ET.tostring(E.tag(E.subtag("text"), "tail")) - 'texttail' - - Here's a somewhat larger example; this shows how to generate HTML - documents, using a mix of prepared factory functions for inline elements, - nested ``E.tag`` calls, and embedded XHTML fragments:: - - # some common inline elements - A = E.a - I = E.i - B = E.b - - def CLASS(v): - # helper function, 'class' is a reserved word - return {'class': v} - - page = ( - E.html( - E.head( - E.title("This is a sample document") - ), - E.body( - E.h1("Hello!", CLASS("title")), - E.p("This is a paragraph with ", B("bold"), " text in it!"), - E.p("This is another paragraph, with a ", - A("link", href="http://www.python.org"), "."), - E.p("Here are some reservered characters: ."), - ET.XML("

And finally, here is an embedded XHTML fragment.

"), - ) - ) - ) - - print ET.tostring(page) - - Here's a prettyprinted version of the output from the above script:: - - - - This is a sample document - - -

Hello!

-

This is a paragraph with bold text in it!

-

This is another paragraph, with link.

-

Here are some reservered characters: <spam&egg>.

-

And finally, here is an embedded XHTML fragment.

- - - - For namespace support, you can pass a namespace map (``nsmap``) - and/or a specific target ``namespace`` to the ElementMaker class:: - - >>> E = ElementMaker(namespace="http://my.ns/") - >>> print(ET.tostring( E.test )) - - - >>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'}) - >>> print(ET.tostring( E.test )) - - """ - - def __init__(self, typemap=None, - namespace=None, nsmap=None, makeelement=None): - if namespace is not None: - self._namespace = '{' + namespace + '}' - else: - self._namespace = None - - if nsmap: - self._nsmap = dict(nsmap) - else: - self._nsmap = None - - if makeelement is not None: - assert callable(makeelement) - self._makeelement = makeelement - else: - self._makeelement = ET.Element - - # initialize type map for this element factory - - if typemap: - typemap = typemap.copy() - else: - typemap = {} - - def add_text(elem, item): - try: - elem[-1].tail = (elem[-1].tail or "") + item - except IndexError: - elem.text = (elem.text or "") + item - if str not in typemap: - typemap[str] = add_text - if unicode not in typemap: - typemap[unicode] = add_text - - def add_dict(elem, item): - attrib = elem.attrib - for k, v in item.items(): - if isinstance(v, basestring): - attrib[k] = v - else: - attrib[k] = typemap[type(v)](None, v) - if dict not in typemap: - typemap[dict] = add_dict - - self._typemap = typemap - - def __call__(self, tag, *children, **attrib): - get = self._typemap.get - - if self._namespace is not None and tag[0] != '{': - tag = self._namespace + tag - elem = self._makeelement(tag, nsmap=self._nsmap) - if attrib: - get(dict)(elem, attrib) - - for item in children: - if callable(item): - item = item() - t = get(type(item)) - if t is None: - if ET.iselement(item): - elem.append(item) - continue - for basetype in type(item).__mro__: - # See if the typemap knows of any of this type's bases. 
- t = get(basetype) - if t is not None: - break - else: - raise TypeError("bad argument type: %s(%r)" % - (type(item).__name__, item)) - v = t(elem, item) - if v: - get(type(v))(elem, v) - - return elem - - def __getattr__(self, tag): - return partial(self, tag) - -# create factory object -E = ElementMaker() diff --git a/lib/lxml/classlookup.pxi b/lib/lxml/classlookup.pxi deleted file mode 100644 index 82740a51..00000000 --- a/lib/lxml/classlookup.pxi +++ /dev/null @@ -1,565 +0,0 @@ -# Configurable Element class lookup - -################################################################################ -# Custom Element classes - -cdef public class ElementBase(_Element) [ type LxmlElementBaseType, - object LxmlElementBase ]: - u"""ElementBase(*children, attrib=None, nsmap=None, **_extra) - - The public Element class. All custom Element classes must inherit - from this one. To create an Element, use the `Element()` factory. - - BIG FAT WARNING: Subclasses *must not* override __init__ or - __new__ as it is absolutely undefined when these objects will be - created or destroyed. All persistent state of Elements must be - stored in the underlying XML. If you really need to initialize - the object after creation, you can implement an ``_init(self)`` - method that will be called directly after object creation. - - Subclasses of this class can be instantiated to create a new - Element. By default, the tag name will be the class name and the - namespace will be empty. You can modify this with the following - class attributes: - - * TAG - the tag name, possibly containing a namespace in Clark - notation - - * NAMESPACE - the default namespace URI, unless provided as part - of the TAG attribute. - - * HTML - flag if the class is an HTML tag, as opposed to an XML - tag. This only applies to un-namespaced tags and defaults to - false (i.e. XML). - - * PARSER - the parser that provides the configuration for the - newly created document. 
Providing an HTML parser here will - default to creating an HTML element. - - In user code, the latter three are commonly inherited in class - hierarchies that implement a common namespace. - """ - def __init__(self, *children, attrib=None, nsmap=None, **_extra): - u"""ElementBase(*children, attrib=None, nsmap=None, **_extra) - """ - cdef bint is_html = 0 - cdef _BaseParser parser - cdef _Element last_child - # don't use normal attribute access as it might be overridden - _getattr = object.__getattribute__ - try: - namespace = _utf8(_getattr(self, 'NAMESPACE')) - except AttributeError: - namespace = None - try: - ns, tag = _getNsTag(_getattr(self, 'TAG')) - if ns is not None: - namespace = ns - except AttributeError: - tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__')) - if b'.' in tag: - tag = tag.split(b'.')[-1] - try: - parser = _getattr(self, 'PARSER') - except AttributeError: - parser = None - for child in children: - if isinstance(child, _Element): - parser = (<_Element>child)._doc._parser - break - if isinstance(parser, HTMLParser): - is_html = 1 - if namespace is None: - try: - is_html = _getattr(self, 'HTML') - except AttributeError: - pass - _initNewElement(self, is_html, tag, namespace, parser, - attrib, nsmap, _extra) - last_child = None - for child in children: - if _isString(child): - if last_child is None: - _setNodeText(self._c_node, - (_collectText(self._c_node.children) or '') + child) - else: - _setTailText(last_child._c_node, - (_collectText(last_child._c_node.next) or '') + child) - elif isinstance(child, _Element): - last_child = child - _appendChild(self, last_child) - elif isinstance(child, type) and issubclass(child, ElementBase): - last_child = child() - _appendChild(self, last_child) - else: - raise TypeError, "Invalid child type: %r" % type(child) - -cdef class CommentBase(_Comment): - u"""All custom Comment classes must inherit from this one. - - To create an XML Comment instance, use the ``Comment()`` factory. 
- - Subclasses *must not* override __init__ or __new__ as it is - absolutely undefined when these objects will be created or - destroyed. All persistent state of Comments must be stored in the - underlying XML. If you really need to initialize the object after - creation, you can implement an ``_init(self)`` method that will be - called after object creation. - """ - def __init__(self, text): - # copied from Comment() factory - cdef _Document doc - cdef xmlDoc* c_doc - if text is None: - text = b'' - else: - text = _utf8(text) - c_doc = _newXMLDoc() - doc = _documentFactory(c_doc, None) - self._c_node = _createComment(c_doc, _xcstr(text)) - if self._c_node is NULL: - raise MemoryError() - tree.xmlAddChild(c_doc, self._c_node) - _registerProxy(self, doc, self._c_node) - self._init() - -cdef class PIBase(_ProcessingInstruction): - u"""All custom Processing Instruction classes must inherit from this one. - - To create an XML ProcessingInstruction instance, use the ``PI()`` - factory. - - Subclasses *must not* override __init__ or __new__ as it is - absolutely undefined when these objects will be created or - destroyed. All persistent state of PIs must be stored in the - underlying XML. If you really need to initialize the object after - creation, you can implement an ``_init(self)`` method that will be - called after object creation. - """ - def __init__(self, target, text=None): - # copied from PI() factory - cdef _Document doc - cdef xmlDoc* c_doc - target = _utf8(target) - if text is None: - text = b'' - else: - text = _utf8(text) - c_doc = _newXMLDoc() - doc = _documentFactory(c_doc, None) - self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text)) - if self._c_node is NULL: - raise MemoryError() - tree.xmlAddChild(c_doc, self._c_node) - _registerProxy(self, doc, self._c_node) - self._init() - -cdef class EntityBase(_Entity): - u"""All custom Entity classes must inherit from this one. - - To create an XML Entity instance, use the ``Entity()`` factory. 
- - Subclasses *must not* override __init__ or __new__ as it is - absolutely undefined when these objects will be created or - destroyed. All persistent state of Entities must be stored in the - underlying XML. If you really need to initialize the object after - creation, you can implement an ``_init(self)`` method that will be - called after object creation. - """ - def __init__(self, name): - cdef _Document doc - cdef xmlDoc* c_doc - name_utf = _utf8(name) - c_name = _xcstr(name_utf) - if c_name[0] == c'#': - if not _characterReferenceIsValid(c_name + 1): - raise ValueError, u"Invalid character reference: '%s'" % name - elif not _xmlNameIsValid(c_name): - raise ValueError, u"Invalid entity reference: '%s'" % name - c_doc = _newXMLDoc() - doc = _documentFactory(c_doc, None) - self._c_node = _createEntity(c_doc, c_name) - if self._c_node is NULL: - raise MemoryError() - tree.xmlAddChild(c_doc, self._c_node) - _registerProxy(self, doc, self._c_node) - self._init() - - -cdef int _validateNodeClass(xmlNode* c_node, cls) except -1: - if c_node.type == tree.XML_ELEMENT_NODE: - expected = ElementBase - elif c_node.type == tree.XML_COMMENT_NODE: - expected = CommentBase - elif c_node.type == tree.XML_ENTITY_REF_NODE: - expected = EntityBase - elif c_node.type == tree.XML_PI_NODE: - expected = PIBase - else: - assert 0, u"Unknown node type: %s" % c_node.type - - if not (isinstance(cls, type) and issubclass(cls, expected)): - raise TypeError( - "result of class lookup must be subclass of %s, got %s" - % (type(expected), type(cls))) - return 0 - - -################################################################################ -# Element class lookup - -ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*) - -# class to store element class lookup functions -cdef public class ElementClassLookup [ type LxmlElementClassLookupType, - object LxmlElementClassLookup ]: - u"""ElementClassLookup(self) - Superclass of Element class lookups. 
- """ - cdef _element_class_lookup_function _lookup_function - def __cinit__(self): - self._lookup_function = NULL # use default lookup - -cdef public class FallbackElementClassLookup(ElementClassLookup) \ - [ type LxmlFallbackElementClassLookupType, - object LxmlFallbackElementClassLookup ]: - u"""FallbackElementClassLookup(self, fallback=None) - - Superclass of Element class lookups with additional fallback. - """ - cdef readonly ElementClassLookup fallback - cdef _element_class_lookup_function _fallback_function - def __cinit__(self): - # fall back to default lookup - self._fallback_function = _lookupDefaultElementClass - - def __init__(self, ElementClassLookup fallback=None): - if fallback is not None: - self._setFallback(fallback) - else: - self._fallback_function = _lookupDefaultElementClass - - cdef void _setFallback(self, ElementClassLookup lookup): - u"""Sets the fallback scheme for this lookup method. - """ - self.fallback = lookup - self._fallback_function = lookup._lookup_function - if self._fallback_function is NULL: - self._fallback_function = _lookupDefaultElementClass - - def set_fallback(self, ElementClassLookup lookup not None): - u"""set_fallback(self, lookup) - - Sets the fallback scheme for this lookup method. - """ - self._setFallback(lookup) - -cdef inline object _callLookupFallback(FallbackElementClassLookup lookup, - _Document doc, xmlNode* c_node): - return lookup._fallback_function(lookup.fallback, doc, c_node) - - -################################################################################ -# default lookup scheme - -cdef class ElementDefaultClassLookup(ElementClassLookup): - u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None) - Element class lookup scheme that always returns the default Element - class. - - The keyword arguments ``element``, ``comment``, ``pi`` and ``entity`` - accept the respective Element classes. 
- """ - cdef readonly object element_class - cdef readonly object comment_class - cdef readonly object pi_class - cdef readonly object entity_class - def __cinit__(self): - self._lookup_function = _lookupDefaultElementClass - - def __init__(self, element=None, comment=None, pi=None, entity=None): - if element is None: - self.element_class = _Element - elif issubclass(element, ElementBase): - self.element_class = element - else: - raise TypeError, u"element class must be subclass of ElementBase" - - if comment is None: - self.comment_class = _Comment - elif issubclass(comment, CommentBase): - self.comment_class = comment - else: - raise TypeError, u"comment class must be subclass of CommentBase" - - if entity is None: - self.entity_class = _Entity - elif issubclass(entity, EntityBase): - self.entity_class = entity - else: - raise TypeError, u"Entity class must be subclass of EntityBase" - - if pi is None: - self.pi_class = None # special case, see below - elif issubclass(pi, PIBase): - self.pi_class = pi - else: - raise TypeError, u"PI class must be subclass of PIBase" - -cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node): - u"Trivial class lookup function that always returns the default class." 
- if c_node.type == tree.XML_ELEMENT_NODE: - if state is not None: - return (state).element_class - else: - return _Element - elif c_node.type == tree.XML_COMMENT_NODE: - if state is not None: - return (state).comment_class - else: - return _Comment - elif c_node.type == tree.XML_ENTITY_REF_NODE: - if state is not None: - return (state).entity_class - else: - return _Entity - elif c_node.type == tree.XML_PI_NODE: - if state is None or (state).pi_class is None: - # special case XSLT-PI - if c_node.name is not NULL and c_node.content is not NULL: - if tree.xmlStrcmp(c_node.name, "xml-stylesheet") == 0: - if tree.xmlStrstr(c_node.content, "text/xsl") is not NULL or \ - tree.xmlStrstr(c_node.content, "text/xml") is not NULL: - return _XSLTProcessingInstruction - return _ProcessingInstruction - else: - return (state).pi_class - else: - assert 0, u"Unknown node type: %s" % c_node.type - - -################################################################################ -# attribute based lookup scheme - -cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup): - u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None) - Checks an attribute of an Element and looks up the value in a - class dictionary. - - Arguments: - - attribute name - '{ns}name' style string - - class mapping - Python dict mapping attribute values to Element classes - - fallback - optional fallback lookup mechanism - - A None key in the class mapping will be checked if the attribute is - missing. 
- """ - cdef object _class_mapping - cdef tuple _pytag - cdef const_xmlChar* _c_ns - cdef const_xmlChar* _c_name - def __cinit__(self): - self._lookup_function = _attribute_class_lookup - - def __init__(self, attribute_name, class_mapping, - ElementClassLookup fallback=None): - self._pytag = _getNsTag(attribute_name) - ns, name = self._pytag - if ns is None: - self._c_ns = NULL - else: - self._c_ns = _xcstr(ns) - self._c_name = _xcstr(name) - self._class_mapping = dict(class_mapping) - - FallbackElementClassLookup.__init__(self, fallback) - -cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node): - cdef AttributeBasedElementClassLookup lookup - cdef python.PyObject* dict_result - - lookup = state - if c_node.type == tree.XML_ELEMENT_NODE: - value = _attributeValueFromNsName( - c_node, lookup._c_ns, lookup._c_name) - dict_result = python.PyDict_GetItem(lookup._class_mapping, value) - if dict_result is not NULL: - cls = dict_result - _validateNodeClass(c_node, cls) - return cls - return _callLookupFallback(lookup, doc, c_node) - - -################################################################################ -# per-parser lookup scheme - -cdef class ParserBasedElementClassLookup(FallbackElementClassLookup): - u"""ParserBasedElementClassLookup(self, fallback=None) - Element class lookup based on the XML parser. 
- """ - def __cinit__(self): - self._lookup_function = _parser_class_lookup - -cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node): - if doc._parser._class_lookup is not None: - return doc._parser._class_lookup._lookup_function( - doc._parser._class_lookup, doc, c_node) - return _callLookupFallback(state, doc, c_node) - - -################################################################################ -# custom class lookup based on node type, namespace, name - -cdef class CustomElementClassLookup(FallbackElementClassLookup): - u"""CustomElementClassLookup(self, fallback=None) - Element class lookup based on a subclass method. - - You can inherit from this class and override the method:: - - lookup(self, type, doc, namespace, name) - - to lookup the element class for a node. Arguments of the method: - * type: one of 'element', 'comment', 'PI', 'entity' - * doc: document that the node is in - * namespace: namespace URI of the node (or None for comments/PIs/entities) - * name: name of the element/entity, None for comments, target for PIs - - If you return None from this method, the fallback will be called. 
- """ - def __cinit__(self): - self._lookup_function = _custom_class_lookup - - def lookup(self, type, doc, namespace, name): - u"lookup(self, type, doc, namespace, name)" - return None - -cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node): - cdef CustomElementClassLookup lookup - - lookup = state - - if c_node.type == tree.XML_ELEMENT_NODE: - element_type = u"element" - elif c_node.type == tree.XML_COMMENT_NODE: - element_type = u"comment" - elif c_node.type == tree.XML_PI_NODE: - element_type = u"PI" - elif c_node.type == tree.XML_ENTITY_REF_NODE: - element_type = u"entity" - else: - element_type = u"element" - if c_node.name is NULL: - name = None - else: - name = funicode(c_node.name) - c_str = tree._getNs(c_node) - ns = funicode(c_str) if c_str is not NULL else None - - cls = lookup.lookup(element_type, doc, ns, name) - if cls is not None: - _validateNodeClass(c_node, cls) - return cls - return _callLookupFallback(lookup, doc, c_node) - - -################################################################################ -# read-only tree based class lookup - -cdef class PythonElementClassLookup(FallbackElementClassLookup): - u"""PythonElementClassLookup(self, fallback=None) - Element class lookup based on a subclass method. - - This class lookup scheme allows access to the entire XML tree in - read-only mode. To use it, re-implement the ``lookup(self, doc, - root)`` method in a subclass:: - - from lxml import etree, pyclasslookup - - class MyElementClass(etree.ElementBase): - honkey = True - - class MyLookup(pyclasslookup.PythonElementClassLookup): - def lookup(self, doc, root): - if root.tag == "sometag": - return MyElementClass - else: - for child in root: - if child.tag == "someothertag": - return MyElementClass - # delegate to default - return None - - If you return None from this method, the fallback will be called. - - The first argument is the opaque document instance that contains - the Element. 
The second argument is a lightweight Element proxy - implementation that is only valid during the lookup. Do not try - to keep a reference to it. Once the lookup is done, the proxy - will be invalid. - - Also, you cannot wrap such a read-only Element in an ElementTree, - and you must take care not to keep a reference to them outside of - the `lookup()` method. - - Note that the API of the Element objects is not complete. It is - purely read-only and does not support all features of the normal - `lxml.etree` API (such as XPath, extended slicing or some - iteration methods). - - See http://codespeak.net/lxml/element_classes.html - """ - def __cinit__(self): - self._lookup_function = _python_class_lookup - - def lookup(self, doc, element): - u"""lookup(self, doc, element) - - Override this method to implement your own lookup scheme. - """ - return None - -cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node): - cdef PythonElementClassLookup lookup - cdef _ReadOnlyElementProxy proxy - lookup = state - - proxy = _newReadOnlyProxy(None, c_node) - cls = lookup.lookup(doc, proxy) - _freeReadOnlyProxies(proxy) - - if cls is not None: - _validateNodeClass(c_node, cls) - return cls - return _callLookupFallback(lookup, doc, c_node) - -################################################################################ -# Global setup - -cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS -cdef object ELEMENT_CLASS_LOOKUP_STATE - -cdef void _setElementClassLookupFunction( - _element_class_lookup_function function, object state): - global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE - if function is NULL: - state = DEFAULT_ELEMENT_CLASS_LOOKUP - function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function - - ELEMENT_CLASS_LOOKUP_STATE = state - LOOKUP_ELEMENT_CLASS = function - -def set_element_class_lookup(ElementClassLookup lookup = None): - u"""set_element_class_lookup(lookup = None) - - Set the global default element class lookup method. 
- """ - if lookup is None or lookup._lookup_function is NULL: - _setElementClassLookupFunction(NULL, None) - else: - _setElementClassLookupFunction(lookup._lookup_function, lookup) - -# default setup: parser delegation -cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP -DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup() - -set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP) diff --git a/lib/lxml/cleanup.pxi b/lib/lxml/cleanup.pxi deleted file mode 100644 index 5388ffb6..00000000 --- a/lib/lxml/cleanup.pxi +++ /dev/null @@ -1,210 +0,0 @@ -# functions for tree cleanup and removing elements from subtrees - -def cleanup_namespaces(tree_or_element): - u"""cleanup_namespaces(tree_or_element) - - Remove all namespace declarations from a subtree that are not used - by any of the elements or attributes in that tree. - """ - cdef _Element element - element = _rootNodeOrRaise(tree_or_element) - _removeUnusedNamespaceDeclarations(element._c_node) - -def strip_attributes(tree_or_element, *attribute_names): - u"""strip_attributes(tree_or_element, *attribute_names) - - Delete all attributes with the provided attribute names from an - Element (or ElementTree) and its descendants. - - Attribute names can contain wildcards as in `_Element.iter`. 
- - Example usage:: - - strip_attributes(root_element, - 'simpleattr', - '{http://some/ns}attrname', - '{http://other/ns}*') - """ - cdef _MultiTagMatcher matcher - cdef _Element element - - element = _rootNodeOrRaise(tree_or_element) - if not attribute_names: - return - - matcher = _MultiTagMatcher(attribute_names) - matcher.cacheTags(element._doc) - if matcher.rejectsAllAttributes(): - return - _strip_attributes(element._c_node, matcher) - -cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher): - cdef xmlAttr* c_attr - cdef xmlAttr* c_next_attr - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - c_attr = c_node.properties - while c_attr is not NULL: - c_next_attr = c_attr.next - if matcher.matchesAttribute(c_attr): - tree.xmlRemoveProp(c_attr) - c_attr = c_next_attr - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - -def strip_elements(tree_or_element, *tag_names, bint with_tail=True): - u"""strip_elements(tree_or_element, *tag_names, with_tail=True) - - Delete all elements with the provided tag names from a tree or - subtree. This will remove the elements and their entire subtree, - including all their attributes, text content and descendants. It - will also remove the tail text of the element unless you - explicitly set the ``with_tail`` keyword argument option to False. - - Tag names can contain wildcards as in `_Element.iter`. - - Note that this will not delete the element (or ElementTree root - element) that you passed even if it matches. It will only treat - its descendants. If you want to include the root element, check - its tag name directly before even calling this function. 
- - Example usage:: - - strip_elements(some_element, - 'simpletagname', # non-namespaced tag - '{http://some/ns}tagname', # namespaced tag - '{http://some/other/ns}*' # any tag from a namespace - lxml.etree.Comment # comments - ) - """ - cdef _MultiTagMatcher matcher - cdef _Element element - cdef _Document doc - cdef list ns_tags - cdef qname* c_ns_tags - cdef Py_ssize_t c_tag_count - cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0 - - doc = _documentOrRaise(tree_or_element) - element = _rootNodeOrRaise(tree_or_element) - if not tag_names: - return - - matcher = _MultiTagMatcher(tag_names) - matcher.cacheTags(doc) - if matcher.rejectsAll(): - return - - if isinstance(tree_or_element, _ElementTree): - # include PIs and comments next to the root node - if matcher.matchesType(tree.XML_COMMENT_NODE): - _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail) - if matcher.matchesType(tree.XML_PI_NODE): - _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail) - _strip_elements(doc, element._c_node, matcher, with_tail) - -cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher, - bint with_tail): - cdef xmlNode* c_child - cdef xmlNode* c_next - - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - # we run through the children here to prevent any problems - # with the tree iteration which would occur if we unlinked the - # c_node itself - c_child = _findChildForwards(c_node, 0) - while c_child is not NULL: - c_next = _nextElement(c_child) - if matcher.matches(c_child): - if c_child.type == tree.XML_ELEMENT_NODE: - if not with_tail: - tree.xmlUnlinkNode(c_child) - _removeNode(doc, c_child) - else: - if with_tail: - _removeText(c_child.next) - tree.xmlUnlinkNode(c_child) - attemptDeallocation(c_child) - c_child = c_next - tree.END_FOR_EACH_ELEMENT_FROM(c_node) - - -def strip_tags(tree_or_element, *tag_names): - u"""strip_tags(tree_or_element, *tag_names) - - Delete all 
elements with the provided tag names from a tree or - subtree. This will remove the elements and their attributes, but - *not* their text/tail content or descendants. Instead, it will - merge the text content and children of the element into its - parent. - - Tag names can contain wildcards as in `_Element.iter`. - - Note that this will not delete the element (or ElementTree root - element) that you passed even if it matches. It will only treat - its descendants. - - Example usage:: - - strip_tags(some_element, - 'simpletagname', # non-namespaced tag - '{http://some/ns}tagname', # namespaced tag - '{http://some/other/ns}*' # any tag from a namespace - Comment # comments (including their text!) - ) - """ - cdef _MultiTagMatcher matcher - cdef _Element element - cdef _Document doc - cdef list ns_tags - cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0 - cdef char** c_ns_tags - cdef Py_ssize_t c_tag_count - - doc = _documentOrRaise(tree_or_element) - element = _rootNodeOrRaise(tree_or_element) - if not tag_names: - return - - matcher = _MultiTagMatcher(tag_names) - matcher.cacheTags(doc) - if matcher.rejectsAll(): - return - - if isinstance(tree_or_element, _ElementTree): - # include PIs and comments next to the root node - if matcher.matchesType(tree.XML_COMMENT_NODE): - _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0) - if matcher.matchesType(tree.XML_PI_NODE): - _removeSiblings(element._c_node, tree.XML_PI_NODE, 0) - _strip_tags(doc, element._c_node, matcher) - -cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher): - cdef xmlNode* c_child - cdef xmlNode* c_next - cdef Py_ssize_t i - - tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) - if c_node.type == tree.XML_ELEMENT_NODE: - # we run through the children here to prevent any problems - # with the tree iteration which would occur if we unlinked the - # c_node itself - c_child = _findChildForwards(c_node, 0) - while c_child is not NULL: - if not 
matcher.matches(c_child): - c_child = _nextElement(c_child) - continue - if c_child.type == tree.XML_ELEMENT_NODE: - c_next = _findChildForwards(c_child, 0) or _nextElement(c_child) - _replaceNodeByChildren(doc, c_child) - if not attemptDeallocation(c_child): - if c_child.nsDef is not NULL: - # make namespaces absolute - moveNodeToDocument(doc, doc._c_doc, c_child) - c_child = c_next - else: - c_next = _nextElement(c_child) - tree.xmlUnlinkNode(c_child) - attemptDeallocation(c_child) - c_child = c_next - tree.END_FOR_EACH_ELEMENT_FROM(c_node) diff --git a/lib/lxml/cssselect.py b/lib/lxml/cssselect.py deleted file mode 100644 index e8effaa2..00000000 --- a/lib/lxml/cssselect.py +++ /dev/null @@ -1,103 +0,0 @@ -"""CSS Selectors based on XPath. - -This module supports selecting XML/HTML tags based on CSS selectors. -See the `CSSSelector` class for details. - -This is a thin wrapper around cssselect 0.7 or later. -""" - -import sys -from lxml import etree - -## Work-around the lack of absolute import in Python 2.4 -#from __future__ import absolute_import -#from cssselect import ... -try: - external_cssselect = __import__('cssselect') -except ImportError: - raise ImportError('cssselect seems not to be installed. ' - 'See http://packages.python.org/cssselect/') - -SelectorSyntaxError = external_cssselect.SelectorSyntaxError -ExpressionError = external_cssselect.ExpressionError -SelectorError = external_cssselect.SelectorError - - -__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError', - 'CSSSelector'] - - -class LxmlTranslator(external_cssselect.GenericTranslator): - """ - A custom CSS selector to XPath translator with lxml-specific extensions. 
- """ - def xpath_contains_function(self, xpath, function): - # Defined there, removed in later drafts: - # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors - if function.argument_types() not in (['STRING'], ['IDENT']): - raise ExpressionError( - "Expected a single string or ident for :contains(), got %r" - % function.arguments) - value = function.arguments[0].value - return xpath.add_condition( - 'contains(__lxml_internal_css:lower-case(string(.)), %s)' - % self.xpath_literal(value.lower())) - - -class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator): - """ - lxml extensions + HTML support. - """ - - -def _make_lower_case(context, s): - return s.lower() - -ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') -ns.prefix = '__lxml_internal_css' -ns['lower-case'] = _make_lower_case - - -class CSSSelector(etree.XPath): - """A CSS selector. - - Usage:: - - >>> from lxml import etree, cssselect - >>> select = cssselect.CSSSelector("a tag > child") - - >>> root = etree.XML("TEXT") - >>> [ el.tag for el in select(root) ] - ['child'] - - To use CSS namespaces, you need to pass a prefix-to-namespace - mapping as ``namespaces`` keyword argument:: - - >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' - >>> select_ns = cssselect.CSSSelector('root > rdf|Description', - ... namespaces={'rdf': rdfns}) - - >>> rdf = etree.XML(( - ... '' - ... 'blah' - ... 
@cython.final
@cython.internal
cdef class _MemDebug:
    """Debugging support for the memory allocation in libxml2.
    """
    def bytes_used(self):
        """bytes_used(self)

        Returns the total amount of memory (in bytes) currently used by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemUsed()

    def blocks_used(self):
        """blocks_used(self)

        Returns the total number of memory blocks currently allocated by libxml2.
        Note that libxml2 constrains this value to a C int, which limits
        the accuracy on 64 bit systems.
        """
        return tree.xmlMemBlocks()

    def dict_size(self):
        """dict_size(self)

        Returns the current size of the global name dictionary used by libxml2
        for the current thread.  Each thread has its own dictionary.

        Raises MemoryError if no thread dictionary could be obtained.
        """
        c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
        if c_dict is NULL:
            raise MemoryError()
        return tree.xmlDictSize(c_dict)

    def dump(self, output_file=None, byte_count=None):
        """dump(self, output_file=None, byte_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorylist" in the current directory.  A unicode path is
        encoded with the file system encoding before it is opened.

        The optional parameter 'byte_count' limits the number of bytes in the dump.
        Note that this parameter is ignored when lxml is compiled against a libxml2
        version before 2.7.0.

        Raises IOError if the output file cannot be created.
        """
        cdef Py_ssize_t c_count
        if output_file is None:
            output_file = b'.memorylist'
        elif isinstance(output_file, unicode):
            # str.encode() returns a new object; the result must be kept,
            # otherwise fopen() below still receives the unicode string.
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            if byte_count is None:
                tree.xmlMemDisplay(f)
            else:
                c_count = byte_count
                tree.xmlMemDisplayLast(f, c_count)
        finally:
            # always release the C FILE*, even if the dump raised
            stdio.fclose(f)

    def show(self, output_file=None, block_count=None):
        """show(self, output_file=None, block_count=None)

        Dumps the current memory blocks allocated by libxml2 to a file.
        The output file format is suitable for line diffing.

        The optional parameter 'output_file' specifies the file path.  It defaults
        to the file ".memorydump" in the current directory.  A unicode path is
        encoded with the file system encoding before it is opened.

        The optional parameter 'block_count' limits the number of blocks
        in the dump.  When it is None, the current block count is used.

        Raises IOError if the output file cannot be created.
        """
        if output_file is None:
            output_file = b'.memorydump'
        elif isinstance(output_file, unicode):
            # keep the encoded bytes (see dump())
            output_file = output_file.encode(sys.getfilesystemencoding())

        f = stdio.fopen(output_file, "w")
        if f is NULL:
            raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding()))
        try:
            tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks())
        finally:
            stdio.fclose(f)

memory_debugger = _MemDebug()
You can pass the - source URL or filename through the ``base_url`` keyword - argument. - """ - cdef _InputDocument doc_ref - if isinstance(string, unicode): - string = (string).encode('utf8') - elif not isinstance(string, bytes): - raise TypeError, "argument must be a byte string or unicode string" - doc_ref = _InputDocument() - doc_ref._type = PARSER_DATA_STRING - doc_ref._data_bytes = string - if base_url is not None: - doc_ref._filename = _encodeFilename(base_url) - return doc_ref - - def resolve_filename(self, filename, context): - u"""resolve_filename(self, filename, context) - - Return the name of a parsable file as input document. - - Pass filename and context as parameters. You can also pass a - URL with an HTTP, FTP or file target. - """ - cdef _InputDocument doc_ref - doc_ref = _InputDocument() - doc_ref._type = PARSER_DATA_FILENAME - doc_ref._filename = _encodeFilename(filename) - return doc_ref - - def resolve_file(self, f, context, *, base_url=None, bint close=True): - u"""resolve_file(self, f, context, base_url=None, close=True) - - Return an open file-like object as input document. - - Pass open file and context as parameters. You can pass the - base URL or filename of the file through the ``base_url`` - keyword argument. If the ``close`` flag is True (the - default), the file will be closed after reading. - - Note that using ``.resolve_filename()`` is more efficient, - especially in threaded environments. 
- """ - cdef _InputDocument doc_ref - try: - f.read - except AttributeError: - raise TypeError, u"Argument is not a file-like object" - doc_ref = _InputDocument() - doc_ref._type = PARSER_DATA_FILE - if base_url is not None: - doc_ref._filename = _encodeFilename(base_url) - else: - doc_ref._filename = _getFilenameForFile(f) - doc_ref._close_file = close - doc_ref._file = f - return doc_ref - -@cython.final -@cython.internal -cdef class _ResolverRegistry: - cdef object _resolvers - cdef Resolver _default_resolver - def __cinit__(self, Resolver default_resolver=None): - self._resolvers = set() - self._default_resolver = default_resolver - - def add(self, Resolver resolver not None): - u"""add(self, resolver) - - Register a resolver. - - For each requested entity, the 'resolve' method of the resolver will - be called and the result will be passed to the parser. If this method - returns None, the request will be delegated to other resolvers or the - default resolver. The resolvers will be tested in an arbitrary order - until the first match is found. 
- """ - self._resolvers.add(resolver) - - def remove(self, resolver): - u"remove(self, resolver)" - self._resolvers.discard(resolver) - - cdef _ResolverRegistry _copy(self): - cdef _ResolverRegistry registry - registry = _ResolverRegistry(self._default_resolver) - registry._resolvers = self._resolvers.copy() - return registry - - def copy(self): - u"copy(self)" - return self._copy() - - def resolve(self, system_url, public_id, context): - u"resolve(self, system_url, public_id, context)" - for resolver in self._resolvers: - result = resolver.resolve(system_url, public_id, context) - if result is not None: - return result - if self._default_resolver is None: - return None - return self._default_resolver.resolve(system_url, public_id, context) - - def __repr__(self): - return repr(self._resolvers) - -@cython.internal -cdef class _ResolverContext(_ExceptionContext): - cdef _ResolverRegistry _resolvers - cdef _TempStore _storage - - cdef void clear(self): - _ExceptionContext.clear(self) - self._storage.clear() - -cdef _initResolverContext(_ResolverContext context, - _ResolverRegistry resolvers): - if resolvers is None: - context._resolvers = _ResolverRegistry() - else: - context._resolvers = resolvers - context._storage = _TempStore() diff --git a/lib/lxml/doctestcompare.py b/lib/lxml/doctestcompare.py deleted file mode 100644 index 3cd5ce48..00000000 --- a/lib/lxml/doctestcompare.py +++ /dev/null @@ -1,505 +0,0 @@ -""" -lxml-based doctest output comparison. - -Note: normally, you should just import the `lxml.usedoctest` and -`lxml.html.usedoctest` modules from within a doctest, instead of this -one:: - - >>> import lxml.usedoctest # for XML output - - >>> import lxml.html.usedoctest # for HTML output - -To use this module directly, you must call ``lxmldoctest.install()``, -which will cause doctest to use this in all subsequent calls. - -This changes the way output is checked and comparisons are made for -XML or HTML-like content. 
- -XML or HTML content is noticed because the example starts with ``<`` -(it's HTML if it starts with ```` or include an ``any`` -attribute in the tag. An ``any`` tag matches any tag, while the -attribute matches any and all attributes. - -When a match fails, the reformatted example and gotten text is -displayed (indented), and a rough diff-like output is given. Anything -marked with ``-`` is in the output but wasn't supposed to be, and -similarly ``+`` means its in the example but wasn't in the output. - -You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` -""" - -from lxml import etree -import sys -import re -import doctest -import cgi - -__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', - 'LHTMLOutputChecker', 'install', 'temp_install'] - -try: - _basestring = basestring -except NameError: - _basestring = (str, bytes) - -_IS_PYTHON_3 = sys.version_info[0] >= 3 - -PARSE_HTML = doctest.register_optionflag('PARSE_HTML') -PARSE_XML = doctest.register_optionflag('PARSE_XML') -NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') - -OutputChecker = doctest.OutputChecker - -def strip(v): - if v is None: - return None - else: - return v.strip() - -def norm_whitespace(v): - return _norm_whitespace_re.sub(' ', v) - -_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True) - -def html_fromstring(html): - return etree.fromstring(html, _html_parser) - -# We use this to distinguish repr()s from elements: -_repr_re = re.compile(r'^<[^>]+ (at|object) ') -_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') - -class LXMLOutputChecker(OutputChecker): - - empty_tags = ( - 'param', 'img', 'area', 'br', 'basefont', 'input', - 'base', 'meta', 'link', 'col') - - def get_default_parser(self): - return etree.XML - - def check_output(self, want, got, optionflags): - alt_self = getattr(self, '_temp_override_self', None) - if alt_self is not None: - super_method = self._temp_call_super_check_output - self = alt_self - 
else: - super_method = OutputChecker.check_output - parser = self.get_parser(want, got, optionflags) - if not parser: - return super_method( - self, want, got, optionflags) - try: - want_doc = parser(want) - except etree.XMLSyntaxError: - return False - try: - got_doc = parser(got) - except etree.XMLSyntaxError: - return False - return self.compare_docs(want_doc, got_doc) - - def get_parser(self, want, got, optionflags): - parser = None - if NOPARSE_MARKUP & optionflags: - return None - if PARSE_HTML & optionflags: - parser = html_fromstring - elif PARSE_XML & optionflags: - parser = etree.XML - elif (want.strip().lower().startswith('' % el.tag - return '<%s %s>' % (el.tag, ' '.join(attrs)) - - def format_end_tag(self, el): - if isinstance(el, etree.CommentBase): - # FIXME: probably PIs should be handled specially too? - return '-->' - return '' % el.tag - - def collect_diff(self, want, got, html, indent): - parts = [] - if not len(want) and not len(got): - parts.append(' '*indent) - parts.append(self.collect_diff_tag(want, got)) - if not self.html_empty_tag(got, html): - parts.append(self.collect_diff_text(want.text, got.text)) - parts.append(self.collect_diff_end_tag(want, got)) - parts.append(self.collect_diff_text(want.tail, got.tail)) - parts.append('\n') - return ''.join(parts) - parts.append(' '*indent) - parts.append(self.collect_diff_tag(want, got)) - parts.append('\n') - if strip(want.text) or strip(got.text): - parts.append(' '*indent) - parts.append(self.collect_diff_text(want.text, got.text)) - parts.append('\n') - want_children = list(want) - got_children = list(got) - while want_children or got_children: - if not want_children: - parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-')) - continue - if not got_children: - parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+')) - continue - parts.append(self.collect_diff( - want_children.pop(0), got_children.pop(0), html, indent+2)) - parts.append(' '*indent) - 
parts.append(self.collect_diff_end_tag(want, got)) - parts.append('\n') - if strip(want.tail) or strip(got.tail): - parts.append(' '*indent) - parts.append(self.collect_diff_text(want.tail, got.tail)) - parts.append('\n') - return ''.join(parts) - - def collect_diff_tag(self, want, got): - if not self.tag_compare(want.tag, got.tag): - tag = '%s (got: %s)' % (want.tag, got.tag) - else: - tag = got.tag - attrs = [] - any = want.tag == 'any' or 'any' in want.attrib - for name, value in sorted(got.attrib.items()): - if name not in want.attrib and not any: - attrs.append('-%s="%s"' % (name, self.format_text(value, False))) - else: - if name in want.attrib: - text = self.collect_diff_text(want.attrib[name], value, False) - else: - text = self.format_text(value, False) - attrs.append('%s="%s"' % (name, text)) - if not any: - for name, value in sorted(want.attrib.items()): - if name in got.attrib: - continue - attrs.append('+%s="%s"' % (name, self.format_text(value, False))) - if attrs: - tag = '<%s %s>' % (tag, ' '.join(attrs)) - else: - tag = '<%s>' % tag - return tag - - def collect_diff_end_tag(self, want, got): - if want.tag != got.tag: - tag = '%s (got: %s)' % (want.tag, got.tag) - else: - tag = got.tag - return '' % tag - - def collect_diff_text(self, want, got, strip=True): - if self.text_compare(want, got, strip): - if not got: - return '' - return self.format_text(got, strip) - text = '%s (got: %s)' % (want, got) - return self.format_text(text, strip) - -class LHTMLOutputChecker(LXMLOutputChecker): - def get_default_parser(self): - return html_fromstring - -def install(html=False): - """ - Install doctestcompare for all future doctests. - - If html is true, then by default the HTML parser will be used; - otherwise the XML parser is used. 
- """ - if html: - doctest.OutputChecker = LHTMLOutputChecker - else: - doctest.OutputChecker = LXMLOutputChecker - -def temp_install(html=False, del_module=None): - """ - Use this *inside* a doctest to enable this checker for this - doctest only. - - If html is true, then by default the HTML parser will be used; - otherwise the XML parser is used. - """ - if html: - Checker = LHTMLOutputChecker - else: - Checker = LXMLOutputChecker - frame = _find_doctest_frame() - dt_self = frame.f_locals['self'] - checker = Checker() - old_checker = dt_self._checker - dt_self._checker = checker - # The unfortunate thing is that there is a local variable 'check' - # in the function that runs the doctests, that is a bound method - # into the output checker. We have to update that. We can't - # modify the frame, so we have to modify the object in place. The - # only way to do this is to actually change the func_code - # attribute of the method. We change it, and then wait for - # __record_outcome to be run, which signals the end of the __run - # method, at which point we restore the previous check_output - # implementation. 
class _RestoreChecker(object):
    """Temporarily swap the code of a bound ``check_output`` function.

    On construction this object

    * stores ``call_super`` and the replacement checker on the old checker
      (``_temp_call_super_check_output`` / ``_temp_override_self``),
    * replaces the code object of ``check_func`` with that of ``clone_func``,
    * installs itself as ``dt_self._DocTestRunner__record_outcome``.

    When it is later called in place of ``__record_outcome`` it undoes all
    of the above, delegates to the original recorder and finally removes
    ``del_module`` (if given) from ``sys.modules``.
    """

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Remember the original code/globals of ``check_func`` and swap
        in the code object of ``clone_func``."""
        if _IS_PYTHON_3:
            self.func_code = self.check_func.__code__
            self.func_globals = self.check_func.__globals__
            self.check_func.__code__ = self.clone_func.__code__
        else:
            self.func_code = self.check_func.func_code
            self.func_globals = self.check_func.func_globals
            self.check_func.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put the remembered code object back onto ``check_func``."""
        if _IS_PYTHON_3:
            self.check_func.__code__ = self.func_code
        else:
            self.check_func.func_code = self.func_code

    def install_dt_self(self):
        """Hook this object in as the runner's ``__record_outcome``."""
        self.prev_func = self.dt_self._DocTestRunner__record_outcome
        self.dt_self._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's original ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Drop ``del_module`` from ``sys.modules`` and from its parent package.

        Missing entries are ignored: the module (or the attribute on its
        parent package) may already have been removed, and cleanup must
        not raise in that case.
        """
        name = self.del_module
        if not name:
            return
        import sys
        sys.modules.pop(name, None)
        if '.' in name:
            package, module = name.rsplit('.', 1)
            package_mod = sys.modules.get(package)
            if package_mod is not None and hasattr(package_mod, module):
                delattr(package_mod, module)

    def __call__(self, *args, **kw):
        """Stand-in for ``__record_outcome``: undo the patching, then
        delegate to the original recorder and return its result."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        try:
            return self.prev_func(*args, **kw)
        finally:
            # remove the module even if the recorder raised
            self.uninstall_module()

    def call_super(self, *args, **kw):
        """Call the original ``check_func`` code, then re-install the clone."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()


def _find_doctest_frame():
    """Return the nearest calling frame that has a local named ``BOOM``.

    The presence of that local is used as the sign of the doctest runner's
    frame.  Raises LookupError if no frame on the call stack has it.
    """
    import sys
    frame = sys._getframe(1)
    while frame:
        frame_locals = frame.f_locals
        if 'BOOM' in frame_locals:
            # Sign of doctest
            return frame
        frame = frame.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
cdef inline int _assertValidDTDNode(node, void *c_node) except -1:
    # Fails with an AssertionError (propagated through the -1 error return)
    # when the proxy object no longer has a C node behind it.
    assert c_node is not NULL, u"invalid DTD proxy at %s" % id(node)


@cython.final
@cython.internal
@cython.freelist(8)
cdef class _DTDElementContentDecl:
    # Read-only proxy around one libxml2 xmlElementContent node.
    # _dtd is copied onto every child proxy created by left/right.
    cdef DTD _dtd
    cdef tree.xmlElementContent* _c_node

    def __repr__(self):
        cls = self.__class__
        return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (
            cls.__module__, cls.__name__,
            self.name, self.type, self.occur, id(self))

    property name:
        # Name stored on the content node, or None if it has none.
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            if self._c_node.name is NULL:
                return None
            return funicode(self._c_node.name)

    property type:
        # One of "pcdata", "element", "seq", "or"; None for any other value.
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int kind = self._c_node.type
            if kind == tree.XML_ELEMENT_CONTENT_PCDATA:
                return "pcdata"
            if kind == tree.XML_ELEMENT_CONTENT_ELEMENT:
                return "element"
            if kind == tree.XML_ELEMENT_CONTENT_SEQ:
                return "seq"
            if kind == tree.XML_ELEMENT_CONTENT_OR:
                return "or"
            return None

    property occur:
        # One of "once", "opt", "mult", "plus"; None for any other value.
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            cdef int count = self._c_node.ocur
            if count == tree.XML_ELEMENT_CONTENT_ONCE:
                return "once"
            if count == tree.XML_ELEMENT_CONTENT_OPT:
                return "opt"
            if count == tree.XML_ELEMENT_CONTENT_MULT:
                return "mult"
            if count == tree.XML_ELEMENT_CONTENT_PLUS:
                return "plus"
            return None

    property left:
        # Proxy for the first child (c1), or None if there is none.
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            c1 = self._c_node.c1
            if not c1:
                return None
            child = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
            child._dtd = self._dtd
            child._c_node = c1
            return child

    property right:
        # Proxy for the second child (c2), or None if there is none.
        def __get__(self):
            _assertValidDTDNode(self, self._c_node)
            c2 = self._c_node.c2
            if not c2:
                return None
            child = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
            child._dtd = self._dtd
            child._c_node = c2
            return child
-@cython.final -@cython.internal -@cython.freelist(8) -cdef class _DTDAttributeDecl: - cdef DTD _dtd - cdef tree.xmlAttribute* _c_node - - def __repr__(self): - return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self)) - - property name: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.name) if self._c_node.name is not NULL else None - - property elemname: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.elem) if self._c_node.elem is not NULL else None - - property prefix: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None - - property type: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - cdef int type = self._c_node.atype - if type == tree.XML_ATTRIBUTE_CDATA: - return "cdata" - elif type == tree.XML_ATTRIBUTE_ID: - return "id" - elif type == tree.XML_ATTRIBUTE_IDREF: - return "idref" - elif type == tree.XML_ATTRIBUTE_IDREFS: - return "idrefs" - elif type == tree.XML_ATTRIBUTE_ENTITY: - return "entity" - elif type == tree.XML_ATTRIBUTE_ENTITIES: - return "entities" - elif type == tree.XML_ATTRIBUTE_NMTOKEN: - return "nmtoken" - elif type == tree.XML_ATTRIBUTE_NMTOKENS: - return "nmtokens" - elif type == tree.XML_ATTRIBUTE_ENUMERATION: - return "enumeration" - elif type == tree.XML_ATTRIBUTE_NOTATION: - return "notation" - else: - return None - - property default: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - cdef int default = self._c_node.def_ - if default == tree.XML_ATTRIBUTE_NONE: - return "none" - elif default == tree.XML_ATTRIBUTE_REQUIRED: - return "required" - elif default == tree.XML_ATTRIBUTE_IMPLIED: - return "implied" - elif default == 
tree.XML_ATTRIBUTE_FIXED: - return "fixed" - else: - return None - - property default_value: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.defaultValue) if self._c_node.defaultValue is not NULL else None - - def itervalues(self): - _assertValidDTDNode(self, self._c_node) - cdef tree.xmlEnumeration *c_node = self._c_node.tree - while c_node is not NULL: - yield funicode(c_node.name) - c_node = c_node.next - - def values(self): - return list(self.itervalues()) - - -@cython.final -@cython.internal -@cython.freelist(8) -cdef class _DTDElementDecl: - cdef DTD _dtd - cdef tree.xmlElement* _c_node - - def __repr__(self): - return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self)) - - property name: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.name) if self._c_node.name is not NULL else None - - property prefix: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None - - property type: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - cdef int type = self._c_node.etype - if type == tree.XML_ELEMENT_TYPE_UNDEFINED: - return "undefined" - elif type == tree.XML_ELEMENT_TYPE_EMPTY: - return "empty" - elif type == tree.XML_ELEMENT_TYPE_ANY: - return "any" - elif type == tree.XML_ELEMENT_TYPE_MIXED: - return "mixed" - elif type == tree.XML_ELEMENT_TYPE_ELEMENT: - return "element" - else: - return None - - property content: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - cdef tree.xmlElementContent *content = self._c_node.content - if content: - node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl) - node._dtd = self._dtd - node._c_node = content - return node - else: - return None - - def iterattributes(self): - 
_assertValidDTDNode(self, self._c_node) - cdef tree.xmlAttribute *c_node = self._c_node.attributes - while c_node: - node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl) - node._dtd = self._dtd - node._c_node = c_node - yield node - c_node = c_node.nexth - - def attributes(self): - return list(self.iterattributes()) - - -@cython.final -@cython.internal -@cython.freelist(8) -cdef class _DTDEntityDecl: - cdef DTD _dtd - cdef tree.xmlEntity* _c_node - def __repr__(self): - return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) - - property name: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.name) if self._c_node.name is not NULL else None - - property orig: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.orig) if self._c_node.orig is not NULL else None - - property content: - def __get__(self): - _assertValidDTDNode(self, self._c_node) - return funicode(self._c_node.content) if self._c_node.content is not NULL else None - - -################################################################################ -# DTD - -cdef class DTD(_Validator): - u"""DTD(self, file=None, external_id=None) - A DTD validator. - - Can load from filesystem directly given a filename or file-like object. - Alternatively, pass the keyword parameter ``external_id`` to load from a - catalog. 
- """ - cdef tree.xmlDtd* _c_dtd - def __init__(self, file=None, *, external_id=None): - _Validator.__init__(self) - if file is not None: - if _isString(file): - file = _encodeFilename(file) - with self._error_log: - self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file)) - elif hasattr(file, 'read'): - self._c_dtd = _parseDtdFromFilelike(file) - else: - raise DTDParseError, u"file must be a filename or file-like object" - elif external_id is not None: - with self._error_log: - self._c_dtd = xmlparser.xmlParseDTD(external_id, NULL) - else: - raise DTDParseError, u"either filename or external ID required" - - if self._c_dtd is NULL: - raise DTDParseError( - self._error_log._buildExceptionMessage(u"error parsing DTD"), - self._error_log) - - property name: - def __get__(self): - if self._c_dtd is NULL: - return None - return funicodeOrNone(self._c_dtd.name) - - property external_id: - def __get__(self): - if self._c_dtd is NULL: - return None - return funicodeOrNone(self._c_dtd.ExternalID) - - property system_url: - def __get__(self): - if self._c_dtd is NULL: - return None - return funicodeOrNone(self._c_dtd.SystemID) - - def iterelements(self): - cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL - while c_node is not NULL: - if c_node.type == tree.XML_ELEMENT_DECL: - node = _DTDElementDecl() - node._dtd = self - node._c_node = c_node - yield node - c_node = c_node.next - - def elements(self): - return list(self.iterelements()) - - def iterentities(self): - cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL - while c_node is not NULL: - if c_node.type == tree.XML_ENTITY_DECL: - node = _DTDEntityDecl() - node._dtd = self - node._c_node = c_node - yield node - c_node = c_node.next - - def entities(self): - return list(self.iterentities()) - - def __dealloc__(self): - tree.xmlFreeDtd(self._c_dtd) - - def __call__(self, etree): - u"""__call__(self, etree) - - Validate doc using the DTD. 
- - Returns true if the document is valid, false if not. - """ - cdef _Document doc - cdef _Element root_node - cdef xmlDoc* c_doc - cdef dtdvalid.xmlValidCtxt* valid_ctxt - cdef int ret = -1 - - assert self._c_dtd is not NULL, "DTD not initialised" - doc = _documentOrRaise(etree) - root_node = _rootNodeOrRaise(etree) - - valid_ctxt = dtdvalid.xmlNewValidCtxt() - if valid_ctxt is NULL: - raise DTDError(u"Failed to create validation context") - - # work around error reporting bug in libxml2 <= 2.9.1 (and later?) - # https://bugzilla.gnome.org/show_bug.cgi?id=724903 - valid_ctxt.error = _nullGenericErrorFunc - valid_ctxt.userData = NULL - - try: - with self._error_log: - c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) - ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd) - _destroyFakeDoc(doc._c_doc, c_doc) - finally: - dtdvalid.xmlFreeValidCtxt(valid_ctxt) - - if ret == -1: - raise DTDValidateError(u"Internal error in DTD validation", - self._error_log) - return ret == 1 - - -cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL: - cdef _ExceptionContext exc_context - cdef _FileReaderContext dtd_parser - cdef _ErrorLog error_log - cdef tree.xmlDtd* c_dtd - exc_context = _ExceptionContext() - dtd_parser = _FileReaderContext(file, exc_context, None) - error_log = _ErrorLog() - - with error_log: - c_dtd = dtd_parser._readDtd() - - exc_context._raise_if_stored() - if c_dtd is NULL: - raise DTDParseError(u"error parsing DTD", error_log) - return c_dtd - -cdef DTD _dtdFactory(tree.xmlDtd* c_dtd): - # do not run through DTD.__init__()! - cdef DTD dtd - if c_dtd is NULL: - return None - dtd = DTD.__new__(DTD) - dtd._c_dtd = _copyDtd(c_dtd) - _Validator.__init__(dtd) - return dtd - - -cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL: - """ - Copy a DTD. libxml2 (currently) fails to set up the element->attributes - links when copying DTDs, so we have to rebuild them here. 
- """ - c_dtd = tree.xmlCopyDtd(c_orig_dtd) - if not c_dtd: - raise MemoryError - cdef tree.xmlNode* c_node = c_dtd.children - while c_node: - if c_node.type == tree.XML_ATTRIBUTE_DECL: - _linkDtdAttribute(c_dtd, c_node) - c_node = c_node.next - return c_dtd - - -cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr): - """ - Create the link to the DTD attribute declaration from the corresponding - element declaration. - """ - c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem) - if not c_elem: - # no such element? something is wrong with the DTD ... - return - c_pos = c_elem.attributes - if not c_pos: - c_elem.attributes = c_attr - c_attr.nexth = NULL - return - # libxml2 keeps namespace declarations first, and we need to make - # sure we don't re-insert attributes that are already there - if _isDtdNsDecl(c_attr): - if not _isDtdNsDecl(c_pos): - c_elem.attributes = c_attr - c_attr.nexth = c_pos - return - while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth): - c_pos = c_pos.nexth - else: - # append at end - while c_pos != c_attr and c_pos.nexth: - c_pos = c_pos.nexth - if c_pos == c_attr: - return - c_attr.nexth = c_pos.nexth - c_pos.nexth = c_attr - - -cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr): - if cstring_h.strcmp(c_attr.name, "xmlns") == 0: - return True - if (c_attr.prefix is not NULL and - cstring_h.strcmp(c_attr.prefix, "xmlns") == 0): - return True - return False diff --git a/lib/lxml/extensions.pxi b/lib/lxml/extensions.pxi deleted file mode 100644 index 531036ef..00000000 --- a/lib/lxml/extensions.pxi +++ /dev/null @@ -1,855 +0,0 @@ -# support for extension functions in XPath and XSLT - -class XPathError(LxmlError): - u"""Base class of all XPath errors. - """ - pass - -class XPathEvalError(XPathError): - u"""Error during XPath evaluation. - """ - pass - -class XPathFunctionError(XPathEvalError): - u"""Internal error looking up an XPath extension function. 
- """ - pass - -class XPathResultError(XPathEvalError): - u"""Error handling an XPath result. - """ - pass - -# forward declarations - -ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf) -cdef class _ExsltRegExp - -################################################################################ -# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ... - -@cython.internal -cdef class _BaseContext: - cdef xpath.xmlXPathContext* _xpathCtxt - cdef _Document _doc - cdef dict _extensions - cdef list _namespaces - cdef list _global_namespaces - cdef dict _utf_refs - cdef dict _function_cache - cdef dict _eval_context_dict - cdef bint _build_smart_strings - # for exception handling and temporary reference keeping: - cdef _TempStore _temp_refs - cdef set _temp_documents - cdef _ExceptionContext _exc - cdef _ErrorLog _error_log - - def __cinit__(self): - self._xpathCtxt = NULL - - def __init__(self, namespaces, extensions, error_log, enable_regexp, - build_smart_strings): - cdef _ExsltRegExp _regexp - cdef dict new_extensions - cdef list ns - self._utf_refs = {} - self._global_namespaces = [] - self._function_cache = {} - self._eval_context_dict = None - self._error_log = error_log - - if extensions is not None: - # convert extensions to UTF-8 - if isinstance(extensions, dict): - extensions = (extensions,) - # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function} - new_extensions = {} - for extension in extensions: - for (ns_uri, name), function in extension.items(): - if name is None: - raise ValueError, u"extensions must have non empty names" - ns_utf = self._to_utf(ns_uri) - name_utf = self._to_utf(name) - new_extensions[(ns_utf, name_utf)] = function - extensions = new_extensions or None - - if namespaces is not None: - if isinstance(namespaces, dict): - namespaces = namespaces.items() - if namespaces: - ns = [] - for prefix, ns_uri in namespaces: - if prefix is None or not prefix: - raise TypeError, \ - u"empty 
namespace prefix is not supported in XPath" - if ns_uri is None or not ns_uri: - raise TypeError, \ - u"setting default namespace is not supported in XPath" - prefix_utf = self._to_utf(prefix) - ns_uri_utf = self._to_utf(ns_uri) - ns.append( (prefix_utf, ns_uri_utf) ) - namespaces = ns - else: - namespaces = None - - self._doc = None - self._exc = _ExceptionContext() - self._extensions = extensions - self._namespaces = namespaces - self._temp_refs = _TempStore() - self._temp_documents = set() - self._build_smart_strings = build_smart_strings - - if enable_regexp: - _regexp = _ExsltRegExp() - _regexp._register_in_context(self) - - cdef _BaseContext _copy(self): - cdef _BaseContext context - if self._namespaces is not None: - namespaces = self._namespaces[:] - else: - namespaces = None - context = self.__class__(namespaces, None, self._error_log, False, - self._build_smart_strings) - if self._extensions is not None: - context._extensions = self._extensions.copy() - return context - - cdef bytes _to_utf(self, s): - u"Convert to UTF-8 and keep a reference to the encoded string" - cdef python.PyObject* dict_result - if s is None: - return None - dict_result = python.PyDict_GetItem(self._utf_refs, s) - if dict_result is not NULL: - return dict_result - utf = _utf8(s) - self._utf_refs[s] = utf - if python.IS_PYPY: - # use C level refs, PyPy refs are not enough! 
- python.Py_INCREF(utf) - return utf - - cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt): - self._xpathCtxt = xpathCtxt - xpathCtxt.userData = self - xpathCtxt.error = _receiveXPathError - - @cython.final - cdef _register_context(self, _Document doc): - self._doc = doc - self._exc.clear() - - @cython.final - cdef _cleanup_context(self): - #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt) - #self.unregisterGlobalNamespaces() - if python.IS_PYPY: - # clean up double refs in PyPy (see "_to_utf()" method) - for ref in self._utf_refs.itervalues(): - python.Py_DECREF(ref) - self._utf_refs.clear() - self._eval_context_dict = None - self._doc = None - - @cython.final - cdef _release_context(self): - if self._xpathCtxt is not NULL: - self._xpathCtxt.userData = NULL - self._xpathCtxt = NULL - - # namespaces (internal UTF-8 methods with leading '_') - - cdef addNamespace(self, prefix, ns_uri): - cdef list namespaces - if prefix is None: - raise TypeError, u"empty prefix is not supported in XPath" - prefix_utf = self._to_utf(prefix) - ns_uri_utf = self._to_utf(ns_uri) - new_item = (prefix_utf, ns_uri_utf) - if self._namespaces is None: - self._namespaces = [new_item] - else: - namespaces = [] - for item in self._namespaces: - if item[0] == prefix_utf: - item = new_item - new_item = None - namespaces.append(item) - if new_item is not None: - namespaces.append(new_item) - self._namespaces = namespaces - if self._xpathCtxt is not NULL: - xpath.xmlXPathRegisterNs( - self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) - - cdef registerNamespace(self, prefix, ns_uri): - if prefix is None: - raise TypeError, u"empty prefix is not supported in XPath" - prefix_utf = self._to_utf(prefix) - ns_uri_utf = self._to_utf(ns_uri) - self._global_namespaces.append(prefix_utf) - xpath.xmlXPathRegisterNs(self._xpathCtxt, - _xcstr(prefix_utf), _xcstr(ns_uri_utf)) - - cdef registerLocalNamespaces(self): - if self._namespaces is None: - return - for prefix_utf, ns_uri_utf 
in self._namespaces: - xpath.xmlXPathRegisterNs( - self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) - - cdef registerGlobalNamespaces(self): - cdef list ns_prefixes = _find_all_extension_prefixes() - if python.PyList_GET_SIZE(ns_prefixes) > 0: - for prefix_utf, ns_uri_utf in ns_prefixes: - self._global_namespaces.append(prefix_utf) - xpath.xmlXPathRegisterNs( - self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) - - cdef unregisterGlobalNamespaces(self): - if python.PyList_GET_SIZE(self._global_namespaces) > 0: - for prefix_utf in self._global_namespaces: - xpath.xmlXPathRegisterNs(self._xpathCtxt, - _xcstr(prefix_utf), NULL) - del self._global_namespaces[:] - - cdef void _unregisterNamespace(self, prefix_utf): - xpath.xmlXPathRegisterNs(self._xpathCtxt, - _xcstr(prefix_utf), NULL) - - # extension functions - - cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1: - if self._extensions is None: - self._extensions = {} - self._extensions[(ns_utf, name_utf)] = function - return 0 - - cdef registerGlobalFunctions(self, void* ctxt, - _register_function reg_func): - cdef python.PyObject* dict_result - cdef dict d - for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems(): - dict_result = python.PyDict_GetItem( - self._function_cache, ns_utf) - if dict_result is not NULL: - d = dict_result - else: - d = {} - self._function_cache[ns_utf] = d - for name_utf, function in ns_functions.iteritems(): - d[name_utf] = function - reg_func(ctxt, name_utf, ns_utf) - - cdef registerLocalFunctions(self, void* ctxt, - _register_function reg_func): - cdef python.PyObject* dict_result - cdef dict d - if self._extensions is None: - return # done - last_ns = None - d = None - for (ns_utf, name_utf), function in self._extensions.iteritems(): - if ns_utf is not last_ns or d is None: - last_ns = ns_utf - dict_result = python.PyDict_GetItem( - self._function_cache, ns_utf) - if dict_result is not NULL: - d = dict_result - else: - d = {} - 
self._function_cache[ns_utf] = d - d[name_utf] = function - reg_func(ctxt, name_utf, ns_utf) - - cdef unregisterAllFunctions(self, void* ctxt, - _register_function unreg_func): - for ns_utf, functions in self._function_cache.iteritems(): - for name_utf in functions: - unreg_func(ctxt, name_utf, ns_utf) - - cdef unregisterGlobalFunctions(self, void* ctxt, - _register_function unreg_func): - for ns_utf, functions in self._function_cache.items(): - for name_utf in functions: - if self._extensions is None or \ - (ns_utf, name_utf) not in self._extensions: - unreg_func(ctxt, name_utf, ns_utf) - - @cython.final - cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name): - u"""Lookup an extension function in the cache and return it. - - Parameters: c_ns_uri may be NULL, c_name must not be NULL - """ - cdef python.PyObject* c_dict - cdef python.PyObject* dict_result - c_dict = python.PyDict_GetItem( - self._function_cache, None if c_ns_uri is NULL else c_ns_uri) - if c_dict is not NULL: - dict_result = python.PyDict_GetItem( - c_dict, c_name) - if dict_result is not NULL: - return dict_result - return None - - # Python access to the XPath context for extension functions - - property context_node: - def __get__(self): - cdef xmlNode* c_node - if self._xpathCtxt is NULL: - raise XPathError, \ - u"XPath context is only usable during the evaluation" - c_node = self._xpathCtxt.node - if c_node is NULL: - raise XPathError, u"no context node" - if c_node.doc != self._xpathCtxt.doc: - raise XPathError, \ - u"document-external context nodes are not supported" - if self._doc is None: - raise XPathError, u"document context is missing" - return _elementFactory(self._doc, c_node) - - property eval_context: - def __get__(self): - if self._eval_context_dict is None: - self._eval_context_dict = {} - return self._eval_context_dict - - # Python reference keeping during XPath function evaluation - - @cython.final - cdef _release_temp_refs(self): - u"Free temporarily 
referenced objects from this context." - self._temp_refs.clear() - self._temp_documents.clear() - - @cython.final - cdef _hold(self, obj): - u"""A way to temporarily hold references to nodes in the evaluator. - - This is needed because otherwise nodes created in XPath extension - functions would be reference counted too soon, during the XPath - evaluation. This is most important in the case of exceptions. - """ - cdef _Element element - if isinstance(obj, _Element): - self._temp_refs.add(obj) - self._temp_documents.add((<_Element>obj)._doc) - return - elif _isString(obj) or not python.PySequence_Check(obj): - return - for o in obj: - if isinstance(o, _Element): - #print "Holding element:", element._c_node - self._temp_refs.add(o) - #print "Holding document:", element._doc._c_doc - self._temp_documents.add((<_Element>o)._doc) - - @cython.final - cdef _Document _findDocumentForNode(self, xmlNode* c_node): - u"""If an XPath expression returns an element from a different - document than the current context document, we call this to - see if it was possibly created by an extension and is a known - document instance. - """ - cdef _Document doc - for doc in self._temp_documents: - if doc is not None and doc._c_doc is c_node.doc: - return doc - return None - - -# libxml2 keeps these error messages in a static array in its code -# and doesn't give us access to them ... 
- -cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = ( - b"Ok", - b"Number encoding", - b"Unfinished literal", - b"Start of literal", - b"Expected $ for variable reference", - b"Undefined variable", - b"Invalid predicate", - b"Invalid expression", - b"Missing closing curly brace", - b"Unregistered function", - b"Invalid operand", - b"Invalid type", - b"Invalid number of arguments", - b"Invalid context size", - b"Invalid context position", - b"Memory allocation error", - b"Syntax error", - b"Resource error", - b"Sub resource error", - b"Undefined namespace prefix", - b"Encoding error", - b"Char out of XML range", - b"Invalid or incomplete context", - b"Stack usage error", -) - -cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil: - cdef xmlerror.xmlError error - cdef int xpath_code - if c_error.message is not NULL: - error.message = c_error.message - else: - xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK - if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES): - error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code]) - else: - error.message = b"unknown error" - error.domain = c_error.domain - error.code = c_error.code - error.level = c_error.level - error.line = c_error.line - error.int2 = c_error.int1 # column - error.file = c_error.file - - (<_BaseContext>c_ctxt)._error_log._receive(&error) - -cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil: - if not __DEBUG: - return - if c_context is NULL: - _forwardError(NULL, error) - else: - _forwardXPathError(c_context, error) - - -def Extension(module, function_mapping=None, *, ns=None): - u"""Extension(module, function_mapping=None, ns=None) - - Build a dictionary of extension functions from the functions - defined in a module or the methods of an object. - - As second argument, you can pass an additional mapping of - attribute names to XPath function names, or a list of function - names that should be taken. 
- - The ``ns`` keyword argument accepts a namespace URI for the XPath - functions. - """ - cdef dict functions = {} - if isinstance(function_mapping, dict): - for function_name, xpath_name in function_mapping.items(): - functions[(ns, xpath_name)] = getattr(module, function_name) - else: - if function_mapping is None: - function_mapping = [ name for name in dir(module) - if not name.startswith(u'_') ] - for function_name in function_mapping: - functions[(ns, function_name)] = getattr(module, function_name) - return functions - -################################################################################ -# EXSLT regexp implementation - -@cython.final -@cython.internal -cdef class _ExsltRegExp: - cdef dict _compile_map - def __cinit__(self): - self._compile_map = {} - - cdef _make_string(self, value): - if _isString(value): - return value - elif isinstance(value, list): - # node set: take recursive text concatenation of first element - if python.PyList_GET_SIZE(value) == 0: - return u'' - firstnode = value[0] - if _isString(firstnode): - return firstnode - elif isinstance(firstnode, _Element): - c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node) - if c_text is NULL: - raise MemoryError() - try: - return funicode(c_text) - finally: - tree.xmlFree(c_text) - else: - return unicode(firstnode) - else: - return unicode(value) - - cdef _compile(self, rexp, ignore_case): - cdef python.PyObject* c_result - rexp = self._make_string(rexp) - key = (rexp, ignore_case) - c_result = python.PyDict_GetItem(self._compile_map, key) - if c_result is not NULL: - return c_result - py_flags = re.UNICODE - if ignore_case: - py_flags = py_flags | re.IGNORECASE - rexp_compiled = re.compile(rexp, py_flags) - self._compile_map[key] = rexp_compiled - return rexp_compiled - - def test(self, ctxt, s, rexp, flags=u''): - flags = self._make_string(flags) - s = self._make_string(s) - rexpc = self._compile(rexp, u'i' in flags) - if rexpc.search(s) is None: - return False - else: - 
return True - - def match(self, ctxt, s, rexp, flags=u''): - cdef list result_list - flags = self._make_string(flags) - s = self._make_string(s) - rexpc = self._compile(rexp, u'i' in flags) - if u'g' in flags: - results = rexpc.findall(s) - if not results: - return () - else: - result = rexpc.search(s) - if not result: - return () - results = [ result.group() ] - results.extend( result.groups(u'') ) - result_list = [] - root = Element(u'matches') - join_groups = u''.join - for s_match in results: - if python.PyTuple_CheckExact(s_match): - s_match = join_groups(s_match) - elem = SubElement(root, u'match') - elem.text = s_match - result_list.append(elem) - return result_list - - def replace(self, ctxt, s, rexp, flags, replacement): - replacement = self._make_string(replacement) - flags = self._make_string(flags) - s = self._make_string(s) - rexpc = self._compile(rexp, u'i' in flags) - if u'g' in flags: - count = 0 - else: - count = 1 - return rexpc.sub(replacement, s, count) - - cdef _register_in_context(self, _BaseContext context): - ns = b"http://exslt.org/regular-expressions" - context._addLocalExtensionFunction(ns, b"test", self.test) - context._addLocalExtensionFunction(ns, b"match", self.match) - context._addLocalExtensionFunction(ns, b"replace", self.replace) - - -################################################################################ -# helper functions - -cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc, - _BaseContext context) except NULL: - cdef xpath.xmlNodeSet* resultSet - cdef _Element fake_node = None - cdef xmlNode* c_node - - if isinstance(obj, unicode): - obj = _utf8(obj) - if isinstance(obj, bytes): - # libxml2 copies the string value - return xpath.xmlXPathNewCString(_cstr(obj)) - if isinstance(obj, bool): - return xpath.xmlXPathNewBoolean(obj) - if python.PyNumber_Check(obj): - return xpath.xmlXPathNewFloat(obj) - if obj is None: - resultSet = xpath.xmlXPathNodeSetCreate(NULL) - elif isinstance(obj, _Element): - 
resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node) - elif python.PySequence_Check(obj): - resultSet = xpath.xmlXPathNodeSetCreate(NULL) - try: - for value in obj: - if isinstance(value, _Element): - if context is not None: - context._hold(value) - xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node) - else: - if context is None or doc is None: - raise XPathResultError, \ - u"Non-Element values not supported at this point - got %r" % value - # support strings by appending text nodes to an Element - if isinstance(value, unicode): - value = _utf8(value) - if isinstance(value, bytes): - if fake_node is None: - fake_node = _makeElement("text-root", NULL, doc, None, - None, None, None, None, None) - context._hold(fake_node) - else: - # append a comment node to keep the text nodes separate - c_node = tree.xmlNewDocComment(doc._c_doc, "") - if c_node is NULL: - raise MemoryError() - tree.xmlAddChild(fake_node._c_node, c_node) - context._hold(value) - c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value)) - if c_node is NULL: - raise MemoryError() - tree.xmlAddChild(fake_node._c_node, c_node) - xpath.xmlXPathNodeSetAdd(resultSet, c_node) - else: - raise XPathResultError, \ - u"This is not a supported node-set result: %r" % value - except: - xpath.xmlXPathFreeNodeSet(resultSet) - raise - else: - raise XPathResultError, u"Unknown return type: %s" % \ - python._fqtypename(obj).decode('utf8') - return xpath.xmlXPathWrapNodeSet(resultSet) - -cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj, - _Document doc, _BaseContext context): - if xpathObj.type == xpath.XPATH_UNDEFINED: - raise XPathResultError, u"Undefined xpath result" - elif xpathObj.type == xpath.XPATH_NODESET: - return _createNodeSetResult(xpathObj, doc, context) - elif xpathObj.type == xpath.XPATH_BOOLEAN: - return xpathObj.boolval - elif xpathObj.type == xpath.XPATH_NUMBER: - return xpathObj.floatval - elif xpathObj.type == xpath.XPATH_STRING: - stringval = 
funicode(xpathObj.stringval) - if context._build_smart_strings: - stringval = _elementStringResultFactory( - stringval, None, None, 0) - return stringval - elif xpathObj.type == xpath.XPATH_POINT: - raise NotImplementedError, u"XPATH_POINT" - elif xpathObj.type == xpath.XPATH_RANGE: - raise NotImplementedError, u"XPATH_RANGE" - elif xpathObj.type == xpath.XPATH_LOCATIONSET: - raise NotImplementedError, u"XPATH_LOCATIONSET" - elif xpathObj.type == xpath.XPATH_USERS: - raise NotImplementedError, u"XPATH_USERS" - elif xpathObj.type == xpath.XPATH_XSLT_TREE: - return _createNodeSetResult(xpathObj, doc, context) - else: - raise XPathResultError, u"Unknown xpath result %s" % unicode(xpathObj.type) - -cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc, - _BaseContext context): - cdef xmlNode* c_node - cdef int i - cdef list result - result = [] - if xpathObj.nodesetval is NULL: - return result - for i in range(xpathObj.nodesetval.nodeNr): - c_node = xpathObj.nodesetval.nodeTab[i] - _unpackNodeSetEntry(result, c_node, doc, context, - xpathObj.type == xpath.XPATH_XSLT_TREE) - return result - -cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc, - _BaseContext context, bint is_fragment): - cdef xmlNode* c_child - if _isElement(c_node): - if c_node.doc != doc._c_doc and c_node.doc._private is NULL: - # XXX: works, but maybe not always the right thing to do? - # XPath: only runs when extensions create or copy trees - # -> we store Python refs to these, so that is OK - # XSLT: can it leak when merging trees from multiple sources? - c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) - # FIXME: call _instantiateElementFromXPath() instead? 
- results.append( - _fakeDocElementFactory(doc, c_node)) - elif c_node.type == tree.XML_TEXT_NODE or \ - c_node.type == tree.XML_CDATA_SECTION_NODE or \ - c_node.type == tree.XML_ATTRIBUTE_NODE: - results.append( - _buildElementStringResult(doc, c_node, context)) - elif c_node.type == tree.XML_NAMESPACE_DECL: - results.append( (funicodeOrNone((c_node).prefix), - funicodeOrNone((c_node).href)) ) - elif c_node.type == tree.XML_DOCUMENT_NODE or \ - c_node.type == tree.XML_HTML_DOCUMENT_NODE: - # ignored for everything but result tree fragments - if is_fragment: - c_child = c_node.children - while c_child is not NULL: - _unpackNodeSetEntry(results, c_child, doc, context, 0) - c_child = c_child.next - elif c_node.type == tree.XML_XINCLUDE_START or \ - c_node.type == tree.XML_XINCLUDE_END: - pass - else: - raise NotImplementedError, \ - u"Not yet implemented result node type: %d" % c_node.type - -cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj): - u"""Free the XPath object, but *never* free the *content* of node sets. - Python dealloc will do that for us. - """ - if xpathObj.nodesetval is not NULL: - xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval) - xpathObj.nodesetval = NULL - xpath.xmlXPathFreeObject(xpathObj) - -cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc, - _BaseContext context): - # NOTE: this may copy the element - only call this when it can't leak - if c_node.doc != doc._c_doc and c_node.doc._private is NULL: - # not from the context document and not from a fake document - # either => may still be from a known document, e.g. one - # created by an extension function - doc = context._findDocumentForNode(c_node) - if doc is None: - # not from a known document at all! 
=> can only make a - # safety copy here - c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) - return _fakeDocElementFactory(doc, c_node) - -################################################################################ -# special str/unicode subclasses - -@cython.final -cdef class _ElementUnicodeResult(unicode): - cdef _Element _parent - cdef readonly object attrname - cdef readonly bint is_tail - cdef readonly bint is_text - cdef readonly bint is_attribute - - def getparent(self): - return self._parent - -class _ElementStringResult(bytes): - # we need to use a Python class here, bytes cannot be C-subclassed - # in Pyrex/Cython - def getparent(self): - return self._parent - -cdef object _elementStringResultFactory(string_value, _Element parent, - attrname, bint is_tail): - cdef _ElementUnicodeResult uresult - cdef bint is_text - cdef bint is_attribute = attrname is not None - if parent is None: - is_text = 0 - else: - is_text = not (is_tail or is_attribute) - - if type(string_value) is bytes: - result = _ElementStringResult(string_value) - result._parent = parent - result.is_attribute = is_attribute - result.is_tail = is_tail - result.is_text = is_text - result.attrname = attrname - return result - else: - uresult = _ElementUnicodeResult(string_value) - uresult._parent = parent - uresult.is_attribute = is_attribute - uresult.is_tail = is_tail - uresult.is_text = is_text - uresult.attrname = attrname - return uresult - -cdef object _buildElementStringResult(_Document doc, xmlNode* c_node, - _BaseContext context): - cdef _Element parent = None - cdef object attrname = None - cdef xmlNode* c_element - cdef bint is_tail - - if c_node.type == tree.XML_ATTRIBUTE_NODE: - attrname = _namespacedName(c_node) - is_tail = 0 - s = tree.xmlNodeGetContent(c_node) - try: - value = funicode(s) - finally: - tree.xmlFree(s) - c_element = NULL - else: - #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type" - # may be tail text 
or normal text - value = funicode(c_node.content) - c_element = _previousElement(c_node) - is_tail = c_element is not NULL - - if not context._build_smart_strings: - return value - - if c_element is NULL: - # non-tail text or attribute text - c_element = c_node.parent - while c_element is not NULL and not _isElement(c_element): - c_element = c_element.parent - - if c_element is not NULL: - parent = _instantiateElementFromXPath(c_element, doc, context) - - return _elementStringResultFactory( - value, parent, attrname, is_tail) - -################################################################################ -# callbacks for XPath/XSLT extension functions - -cdef void _extension_function_call(_BaseContext context, function, - xpath.xmlXPathParserContext* ctxt, int nargs): - cdef _Document doc - cdef xpath.xmlXPathObject* obj - cdef list args - cdef int i - doc = context._doc - try: - args = [] - for i in range(nargs): - obj = xpath.valuePop(ctxt) - o = _unwrapXPathObject(obj, doc, context) - _freeXPathObject(obj) - args.append(o) - args.reverse() - - res = function(context, *args) - # wrap result for XPath consumption - obj = _wrapXPathObject(res, doc, context) - # prevent Python from deallocating elements handed to libxml2 - context._hold(res) - xpath.valuePush(ctxt, obj) - except: - xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR) - context._exc._store_raised() - finally: - return # swallow any further exceptions - -# lookup the function by name and call it - -cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt, - int nargs) with gil: - cdef _BaseContext context - cdef xpath.xmlXPathContext* rctxt = ctxt.context - context = <_BaseContext> rctxt.userData - try: - function = context._find_cached_function(rctxt.functionURI, rctxt.function) - if function is not None: - _extension_function_call(context, function, ctxt, nargs) - else: - xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR) - context._exc._store_exception( - XPathFunctionError(u"XPath 
function '%s' not found" % - _namespacedNameFromNsName(rctxt.functionURI, rctxt.function))) - except: - # may not be the right error, but we need to tell libxml2 *something* - xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR) - context._exc._store_raised() - finally: - return # swallow any further exceptions diff --git a/lib/lxml/html/ElementSoup.py b/lib/lxml/html/ElementSoup.py deleted file mode 100644 index 8e4fde13..00000000 --- a/lib/lxml/html/ElementSoup.py +++ /dev/null @@ -1,10 +0,0 @@ -__doc__ = """Legacy interface to the BeautifulSoup HTML parser. -""" - -__all__ = ["parse", "convert_tree"] - -from soupparser import convert_tree, parse as _parse - -def parse(file, beautifulsoup=None, makeelement=None): - root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement) - return root.getroot() diff --git a/lib/lxml/html/__init__.py b/lib/lxml/html/__init__.py deleted file mode 100644 index fe28c3bb..00000000 --- a/lib/lxml/html/__init__.py +++ /dev/null @@ -1,1697 +0,0 @@ -# Copyright (c) 2004 Ian Bicking. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in -# the documentation and/or other materials provided with the -# distribution. -# -# 3. Neither the name of Ian Bicking nor the names of its contributors may -# be used to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""The ``lxml.html`` tool set for HTML handling. -""" - -import sys -import re -try: - from urlparse import urljoin -except ImportError: - # Python 3 - from urllib.parse import urljoin -import copy -from lxml import etree -from lxml.html import defs -from lxml.html._setmixin import SetMixin -try: - from collections import MutableMapping as DictMixin -except ImportError: - # Python < 2.6 - from UserDict import DictMixin -try: - set -except NameError: - # Python 2.3 - from sets import Set as set -try: - bytes -except NameError: - # Python < 2.6 - bytes = str -try: - unicode -except NameError: - # Python 3 - unicode = str -try: - basestring -except NameError: - # Python 3 - basestring = (str, bytes) - -def __fix_docstring(s): - if not s: - return s - import sys - if sys.version_info[0] >= 3: - sub = re.compile(r"^(\s*)u'", re.M).sub - else: - sub = re.compile(r"^(\s*)b'", re.M).sub - return sub(r"\1'", s) - -__all__ = [ - 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', - 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', - 'find_rel_links', 'find_class', 'make_links_absolute', - 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 
'parse'] - -XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" - -_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", - namespaces={'x':XHTML_NAMESPACE}) -_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", - namespaces={'x':XHTML_NAMESPACE}) -_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", - namespaces={'x':XHTML_NAMESPACE}) -#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) -_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") -_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") -_collect_string_content = etree.XPath("string()") -_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I) -_css_import_re = re.compile(r'@import "(.*?)"') -_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", - namespaces={'x':XHTML_NAMESPACE}) -_archive_re = re.compile(r'[^ ]+') - -def _unquote_match(s, pos): - if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": - return s[1:-1], pos+1 - else: - return s,pos - -def _transform_result(typ, result): - """Convert the result back into the input type. - """ - if issubclass(typ, bytes): - return tostring(result, encoding='utf-8') - elif issubclass(typ, unicode): - return tostring(result, encoding='unicode') - else: - return result - -def _nons(tag): - if isinstance(tag, basestring): - if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: - return tag.split('}')[-1] - return tag - -class HtmlMixin(object): - - def base_url(self): - """ - Returns the base URL, given when the page was parsed. - - Use with ``urlparse.urljoin(el.base_url, href)`` to get - absolute URLs. 
- """ - return self.getroottree().docinfo.URL - base_url = property(base_url, doc=base_url.__doc__) - - def forms(self): - """ - Return a list of all the forms - """ - return _forms_xpath(self) - forms = property(forms, doc=forms.__doc__) - - def body(self): - """ - Return the element. Can be called from a child element - to get the document's head. - """ - return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] - body = property(body, doc=body.__doc__) - - def head(self): - """ - Returns the element. Can be called from a child - element to get the document's head. - """ - return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] - head = property(head, doc=head.__doc__) - - def _label__get(self): - """ - Get or set any