From 32c029c3cf639a48c35daa003caa68731308670e Mon Sep 17 00:00:00 2001 From: echel0n Date: Fri, 28 Mar 2014 21:32:46 -0700 Subject: [PATCH] Added lxml to our libs --- lib/lxml/ElementInclude.py | 223 + lib/lxml/__init__.py | 20 + lib/lxml/_elementpath.py | 306 ++ lib/lxml/apihelpers.pxi | 1645 +++++++ lib/lxml/builder.py | 238 + lib/lxml/classlookup.pxi | 565 +++ lib/lxml/cleanup.pxi | 210 + lib/lxml/cssselect.py | 103 + lib/lxml/cvarargs.pxd | 8 + lib/lxml/debug.pxi | 91 + lib/lxml/docloader.pxi | 175 + lib/lxml/doctestcompare.py | 505 ++ lib/lxml/dtd.pxi | 468 ++ lib/lxml/extensions.pxi | 855 ++++ lib/lxml/html/ElementSoup.py | 10 + lib/lxml/html/__init__.py | 1697 +++++++ lib/lxml/html/_diffcommand.py | 87 + lib/lxml/html/_html5builder.py | 100 + lib/lxml/html/_setmixin.py | 115 + lib/lxml/html/builder.py | 133 + lib/lxml/html/clean.py | 724 +++ lib/lxml/html/defs.py | 137 + lib/lxml/html/diff.py | 881 ++++ lib/lxml/html/formfill.py | 299 ++ lib/lxml/html/html5parser.py | 207 + lib/lxml/html/soupparser.py | 125 + lib/lxml/html/tests/__init__.py | 1 + .../feedparser-data/entry_content_applet.data | 7 + .../feedparser-data/entry_content_blink.data | 8 + .../feedparser-data/entry_content_crazy.data | 84 + .../feedparser-data/entry_content_embed.data | 8 + .../feedparser-data/entry_content_frame.data | 7 + .../feedparser-data/entry_content_iframe.data | 8 + .../feedparser-data/entry_content_link.data | 7 + .../feedparser-data/entry_content_meta.data | 7 + .../feedparser-data/entry_content_object.data | 8 + .../entry_content_onabort.data | 7 + .../feedparser-data/entry_content_onblur.data | 7 + .../entry_content_onchange.data | 7 + .../entry_content_onclick.data | 7 + .../entry_content_ondblclick.data | 7 + .../entry_content_onerror.data | 7 + .../entry_content_onfocus.data | 7 + .../entry_content_onkeydown.data | 7 + .../entry_content_onkeypress.data | 7 + .../entry_content_onkeyup.data | 7 + .../feedparser-data/entry_content_onload.data | 7 + .../entry_content_onmousedown.data | 7 + .../entry_content_onmouseout.data | 7 + .../entry_content_onmouseover.data | 7 + .../entry_content_onmouseup.data | 7 + .../entry_content_onreset.data | 7 + .../entry_content_onresize.data | 7 + .../entry_content_onsubmit.data | 7 + .../entry_content_onunload.data | 7 + .../feedparser-data/entry_content_script.data | 7 + .../entry_content_script_cdata.data | 13 + .../entry_content_script_inline.data | 7 + .../feedparser-data/entry_content_style.data | 7 + .../background-image-plus.data | 8 + .../background-image-with-unicoded.data | 10 + .../hackers-org-data/downlevel-hidden.data | 9 + .../hackers-org-data/html-plus-time.data | 12 + .../hackers-org-data/javascript-link.data | 15 + .../tests/hackers-org-data/style-comment.data | 8 + .../hackers-org-data/style-expression.data | 10 + .../tests/hackers-org-data/style-import.data | 8 + .../tests/hackers-org-data/style-js-tag.data | 7 + .../tests/hackers-org-data/style-url-js.data | 8 + .../hackers-org-data/xml-data-island.data | 10 + .../hackers-org-data/xml-embedded-js.data | 9 + .../xml-namespace.data.BROKEN | 16 + lib/lxml/html/tests/test_autolink.py | 11 + lib/lxml/html/tests/test_autolink.txt | 79 + lib/lxml/html/tests/test_basic.py | 13 + lib/lxml/html/tests/test_basic.txt | 162 + lib/lxml/html/tests/test_clean.py | 80 + lib/lxml/html/tests/test_clean.txt | 161 + lib/lxml/html/tests/test_clean_embed.txt | 39 + lib/lxml/html/tests/test_diff.py | 14 + lib/lxml/html/tests/test_diff.txt | 252 + lib/lxml/html/tests/test_elementsoup.py | 33 + lib/lxml/html/tests/test_feedparser_data.py | 98 + lib/lxml/html/tests/test_formfill.py | 8 + lib/lxml/html/tests/test_formfill.txt | 112 + lib/lxml/html/tests/test_forms.py | 11 + lib/lxml/html/tests/test_forms.txt | 195 + lib/lxml/html/tests/test_frames.py | 36 + lib/lxml/html/tests/test_html5parser.py | 429 ++ lib/lxml/html/tests/test_rewritelinks.py | 11 + lib/lxml/html/tests/test_rewritelinks.txt | 245 + lib/lxml/html/tests/test_xhtml.py | 11 + lib/lxml/html/tests/test_xhtml.txt | 30 + .../html/tests/transform_feedparser_data.py | 110 + lib/lxml/html/usedoctest.py | 13 + lib/lxml/includes/__init__.py | 0 lib/lxml/includes/c14n.pxd | 26 + lib/lxml/includes/config.pxd | 3 + lib/lxml/includes/dtdvalid.pxd | 18 + lib/lxml/includes/etree_defs.h | 328 ++ lib/lxml/includes/etreepublic.pxd | 234 + lib/lxml/includes/htmlparser.pxd | 56 + lib/lxml/includes/relaxng.pxd | 64 + lib/lxml/includes/schematron.pxd | 34 + lib/lxml/includes/tree.pxd | 464 ++ lib/lxml/includes/uri.pxd | 5 + lib/lxml/includes/xinclude.pxd | 22 + lib/lxml/includes/xmlerror.pxd | 850 ++++ lib/lxml/includes/xmlparser.pxd | 218 + lib/lxml/includes/xmlschema.pxd | 35 + lib/lxml/includes/xpath.pxd | 135 + lib/lxml/includes/xslt.pxd | 176 + lib/lxml/isoschematron/__init__.py | 316 ++ .../resources/rng/iso-schematron.rng | 622 +++ .../resources/xsl/RNG2Schtrn.xsl | 75 + .../resources/xsl/XSD2Schtrn.xsl | 77 + .../iso_abstract_expand.xsl | 296 ++ .../iso-schematron-xslt1/iso_dsdl_include.xsl | 1160 +++++ .../iso_schematron_message.xsl | 55 + .../iso_schematron_skeleton_for_xslt1.xsl | 1796 +++++++ .../iso_svrl_for_xslt1.xsl | 588 +++ .../xsl/iso-schematron-xslt1/readme.txt | 83 + lib/lxml/iterparse.pxi | 357 ++ lib/lxml/lxml.etree.pyx | 3332 +++++++++++++ lib/lxml/lxml.objectify.pyx | 1985 ++++++++ lib/lxml/lxml_endian.h | 14 + lib/lxml/nsclasses.pxi | 239 + lib/lxml/objectpath.pxi | 330 ++ lib/lxml/parser.pxi | 1780 +++++++ lib/lxml/parsertarget.pxi | 168 + lib/lxml/proxy.pxi | 503 ++ lib/lxml/public-api.pxi | 172 + lib/lxml/pyclasslookup.py | 3 + lib/lxml/python.pxd | 135 + lib/lxml/readonlytree.pxi | 554 +++ lib/lxml/relaxng.pxi | 123 + lib/lxml/sax.py | 247 + lib/lxml/saxparser.pxi | 763 +++ lib/lxml/schematron.pxi | 164 + lib/lxml/serializer.pxi | 903 ++++ lib/lxml/tests/__init__.py | 4 + lib/lxml/tests/common_imports.py | 303 ++ lib/lxml/tests/dummy_http_server.py | 83 + lib/lxml/tests/include/test_xinclude.xml | 4 + lib/lxml/tests/shakespeare.html | 526 +++ lib/lxml/tests/test-document.xslt | 10 + lib/lxml/tests/test-string.xml | 2 + lib/lxml/tests/test.dtd | 11 + lib/lxml/tests/test.sch | 8 + lib/lxml/tests/test.xml | 2 + lib/lxml/tests/test.xsd | 8 + lib/lxml/tests/test1.rng | 6 + lib/lxml/tests/test1.xslt | 9 + lib/lxml/tests/test2.rng | 13 + lib/lxml/tests/test2.xslt | 8 + lib/lxml/tests/test_broken.xml | 1 + lib/lxml/tests/test_builder.py | 38 + lib/lxml/tests/test_classlookup.py | 405 ++ lib/lxml/tests/test_css.py | 65 + lib/lxml/tests/test_doctestcompare.py | 95 + lib/lxml/tests/test_dtd.py | 315 ++ lib/lxml/tests/test_elementtree.py | 4136 ++++++++++++++++ lib/lxml/tests/test_errors.py | 50 + lib/lxml/tests/test_etree.py | 4163 +++++++++++++++++ lib/lxml/tests/test_htmlparser.py | 516 ++ lib/lxml/tests/test_http_io.py | 131 + lib/lxml/tests/test_import.xsd | 10 + lib/lxml/tests/test_inc.xsd | 10 + lib/lxml/tests/test_incremental_xmlfile.py | 248 + lib/lxml/tests/test_io.py | 386 ++ lib/lxml/tests/test_isoschematron.py | 851 ++++ lib/lxml/tests/test_nsclasses.py | 215 + lib/lxml/tests/test_objectify.py | 2586 ++++++++++ lib/lxml/tests/test_pyclasslookup.py | 345 ++ lib/lxml/tests/test_relaxng.py | 180 + lib/lxml/tests/test_sax.py | 277 ++ lib/lxml/tests/test_schematron.py | 84 + lib/lxml/tests/test_threading.py | 422 ++ lib/lxml/tests/test_unicode.py | 99 + lib/lxml/tests/test_xmlschema.py | 442 ++ lib/lxml/tests/test_xpathevaluator.py | 750 +++ lib/lxml/tests/test_xslt.py | 1848 ++++++++ lib/lxml/usedoctest.py | 13 + lib/lxml/xinclude.pxi | 64 + lib/lxml/xmlerror.pxi | 1581 +++++++ lib/lxml/xmlid.pxi | 179 + lib/lxml/xmlschema.pxi | 214 + lib/lxml/xpath.pxi | 506 ++ lib/lxml/xslt.pxi | 926 ++++ lib/lxml/xsltext.pxi | 243 + 190 files changed, 55421 insertions(+) create mode 100644 lib/lxml/ElementInclude.py create mode 100644 lib/lxml/__init__.py create mode 100644 lib/lxml/_elementpath.py create mode 100644 lib/lxml/apihelpers.pxi create mode 100644 lib/lxml/builder.py create mode 100644 lib/lxml/classlookup.pxi create mode 100644 lib/lxml/cleanup.pxi create mode 100644 lib/lxml/cssselect.py create mode 100644 lib/lxml/cvarargs.pxd create mode 100644 lib/lxml/debug.pxi create mode 100644 lib/lxml/docloader.pxi create mode 100644 lib/lxml/doctestcompare.py create mode 100644 lib/lxml/dtd.pxi create mode 100644 lib/lxml/extensions.pxi create mode 100644 lib/lxml/html/ElementSoup.py create mode 100644 lib/lxml/html/__init__.py create mode 100644 lib/lxml/html/_diffcommand.py create mode 100644 lib/lxml/html/_html5builder.py create mode 100644 lib/lxml/html/_setmixin.py create mode 100644 lib/lxml/html/builder.py create mode 100644 lib/lxml/html/clean.py create mode 100644 lib/lxml/html/defs.py create mode 100644 lib/lxml/html/diff.py create mode 100644 lib/lxml/html/formfill.py create mode 100644 lib/lxml/html/html5parser.py create mode 100644 lib/lxml/html/soupparser.py create mode 100644 lib/lxml/html/tests/__init__.py create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_applet.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_blink.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_crazy.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_embed.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_frame.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_iframe.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_link.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_meta.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_object.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onabort.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onblur.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onchange.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onclick.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_ondblclick.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onerror.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onfocus.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onkeydown.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onkeypress.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onkeyup.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onload.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onmousedown.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onmouseout.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onmouseover.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onmouseup.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onreset.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onresize.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onsubmit.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_onunload.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_script.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_script_cdata.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_script_inline.data create mode 100644 lib/lxml/html/tests/feedparser-data/entry_content_style.data create mode 100644 lib/lxml/html/tests/hackers-org-data/background-image-plus.data create mode 100644 lib/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data create mode 100644 lib/lxml/html/tests/hackers-org-data/downlevel-hidden.data create mode 100644 lib/lxml/html/tests/hackers-org-data/html-plus-time.data create mode 100644 lib/lxml/html/tests/hackers-org-data/javascript-link.data create mode 100644 lib/lxml/html/tests/hackers-org-data/style-comment.data create mode 100644 lib/lxml/html/tests/hackers-org-data/style-expression.data create mode 100644 lib/lxml/html/tests/hackers-org-data/style-import.data create mode 100644 lib/lxml/html/tests/hackers-org-data/style-js-tag.data create mode 100644 lib/lxml/html/tests/hackers-org-data/style-url-js.data create mode 100644 lib/lxml/html/tests/hackers-org-data/xml-data-island.data create mode 100644 lib/lxml/html/tests/hackers-org-data/xml-embedded-js.data create mode 100644 lib/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN create mode 100644 lib/lxml/html/tests/test_autolink.py create mode 100644 lib/lxml/html/tests/test_autolink.txt create mode 100644 lib/lxml/html/tests/test_basic.py create mode 100644 lib/lxml/html/tests/test_basic.txt create mode 100644 lib/lxml/html/tests/test_clean.py create mode 100644 lib/lxml/html/tests/test_clean.txt create mode 100644 lib/lxml/html/tests/test_clean_embed.txt create mode 100644 lib/lxml/html/tests/test_diff.py create mode 100644 lib/lxml/html/tests/test_diff.txt create mode 100644 lib/lxml/html/tests/test_elementsoup.py create mode 100644 lib/lxml/html/tests/test_feedparser_data.py create mode 100644 lib/lxml/html/tests/test_formfill.py create mode 100644 lib/lxml/html/tests/test_formfill.txt create mode 100644 lib/lxml/html/tests/test_forms.py create mode 100644 lib/lxml/html/tests/test_forms.txt create mode 100644 lib/lxml/html/tests/test_frames.py create mode 100644 lib/lxml/html/tests/test_html5parser.py create mode 100644 lib/lxml/html/tests/test_rewritelinks.py create mode 100644 lib/lxml/html/tests/test_rewritelinks.txt create mode 100644 lib/lxml/html/tests/test_xhtml.py create mode 100644 lib/lxml/html/tests/test_xhtml.txt create mode 100644 lib/lxml/html/tests/transform_feedparser_data.py create mode 100644 lib/lxml/html/usedoctest.py create mode 100644 lib/lxml/includes/__init__.py create mode 100644 lib/lxml/includes/c14n.pxd create mode 100644 lib/lxml/includes/config.pxd create mode 100644 lib/lxml/includes/dtdvalid.pxd create mode 100644 lib/lxml/includes/etree_defs.h create mode 100644 lib/lxml/includes/etreepublic.pxd create mode 100644 lib/lxml/includes/htmlparser.pxd create mode 100644 lib/lxml/includes/relaxng.pxd create mode 100644 lib/lxml/includes/schematron.pxd create mode 100644 lib/lxml/includes/tree.pxd create mode 100644 lib/lxml/includes/uri.pxd create mode 100644 lib/lxml/includes/xinclude.pxd create mode 100644 lib/lxml/includes/xmlerror.pxd create mode 100644 lib/lxml/includes/xmlparser.pxd create mode 100644 lib/lxml/includes/xmlschema.pxd create mode 100644 lib/lxml/includes/xpath.pxd create mode 100644 lib/lxml/includes/xslt.pxd create mode 100644 lib/lxml/isoschematron/__init__.py create mode 100644 lib/lxml/isoschematron/resources/rng/iso-schematron.rng create mode 100644 lib/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl create mode 100644 lib/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt create mode 100644 lib/lxml/iterparse.pxi create mode 100644 lib/lxml/lxml.etree.pyx create mode 100644 lib/lxml/lxml.objectify.pyx create mode 100644 lib/lxml/lxml_endian.h create mode 100644 lib/lxml/nsclasses.pxi create mode 100644 lib/lxml/objectpath.pxi create mode 100644 lib/lxml/parser.pxi create mode 100644 lib/lxml/parsertarget.pxi create mode 100644 lib/lxml/proxy.pxi create mode 100644 lib/lxml/public-api.pxi create mode 100644 lib/lxml/pyclasslookup.py create mode 100644 lib/lxml/python.pxd create mode 100644 lib/lxml/readonlytree.pxi create mode 100644 lib/lxml/relaxng.pxi create mode 100644 lib/lxml/sax.py create mode 100644 lib/lxml/saxparser.pxi create mode 100644 lib/lxml/schematron.pxi create mode 100644 lib/lxml/serializer.pxi create mode 100644 lib/lxml/tests/__init__.py create mode 100644 lib/lxml/tests/common_imports.py create mode 100644 lib/lxml/tests/dummy_http_server.py create mode 100644 lib/lxml/tests/include/test_xinclude.xml create mode 100644 lib/lxml/tests/shakespeare.html create mode 100644 lib/lxml/tests/test-document.xslt create mode 100644 lib/lxml/tests/test-string.xml create mode 100644 lib/lxml/tests/test.dtd create mode 100644 lib/lxml/tests/test.sch create mode 100644 lib/lxml/tests/test.xml create mode 100644 lib/lxml/tests/test.xsd create mode 100644 lib/lxml/tests/test1.rng create mode 100644 lib/lxml/tests/test1.xslt create mode 100644 lib/lxml/tests/test2.rng create mode 100644 lib/lxml/tests/test2.xslt create mode 100644 lib/lxml/tests/test_broken.xml create mode 100644 lib/lxml/tests/test_builder.py create mode 100644 lib/lxml/tests/test_classlookup.py create mode 100644 lib/lxml/tests/test_css.py create mode 100644 lib/lxml/tests/test_doctestcompare.py create mode 100644 lib/lxml/tests/test_dtd.py create mode 100644 lib/lxml/tests/test_elementtree.py create mode 100644 lib/lxml/tests/test_errors.py create mode 100644 lib/lxml/tests/test_etree.py create mode 100644 lib/lxml/tests/test_htmlparser.py create mode 100644 lib/lxml/tests/test_http_io.py create mode 100644 lib/lxml/tests/test_import.xsd create mode 100644 lib/lxml/tests/test_inc.xsd create mode 100644 lib/lxml/tests/test_incremental_xmlfile.py create mode 100644 lib/lxml/tests/test_io.py create mode 100644 lib/lxml/tests/test_isoschematron.py create mode 100644 lib/lxml/tests/test_nsclasses.py create mode 100644 lib/lxml/tests/test_objectify.py create mode 100644 lib/lxml/tests/test_pyclasslookup.py create mode 100644 lib/lxml/tests/test_relaxng.py create mode 100644 lib/lxml/tests/test_sax.py create mode 100644 lib/lxml/tests/test_schematron.py create mode 100644 lib/lxml/tests/test_threading.py create mode 100644 lib/lxml/tests/test_unicode.py create mode 100644 lib/lxml/tests/test_xmlschema.py create mode 100644 lib/lxml/tests/test_xpathevaluator.py create mode 100644 lib/lxml/tests/test_xslt.py create mode 100644 lib/lxml/usedoctest.py create mode 100644 lib/lxml/xinclude.pxi create mode 100644 lib/lxml/xmlerror.pxi create mode 100644 lib/lxml/xmlid.pxi create mode 100644 lib/lxml/xmlschema.pxi create mode 100644 lib/lxml/xpath.pxi create mode 100644 lib/lxml/xslt.pxi create mode 100644 lib/lxml/xsltext.pxi diff --git a/lib/lxml/ElementInclude.py b/lib/lxml/ElementInclude.py new file mode 100644 index 00000000..f7806709 --- /dev/null +++ b/lib/lxml/ElementInclude.py @@ -0,0 +1,223 @@ +# +# ElementTree +# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $ +# +# limited xinclude support for element trees +# +# history: +# 2003-08-15 fl created +# 2003-11-14 fl fixed default loader +# +# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved. +# +# fredrik@pythonware.com +# http://www.pythonware.com +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. +# -------------------------------------------------------------------- + +""" +Limited XInclude support for the ElementTree package. + +While lxml.etree has full support for XInclude (see +`etree.ElementTree.xinclude()`), this module provides a simpler, pure +Python, ElementTree compatible implementation that supports a simple +form of custom URL resolvers. +""" + +from lxml import etree +import copy +try: + from urlparse import urljoin + from urllib2 import urlopen +except ImportError: + # Python 3 + from urllib.parse import urljoin + from urllib.request import urlopen + +try: + set +except NameError: + # Python 2.3 + from sets import Set as set + +XINCLUDE = "{http://www.w3.org/2001/XInclude}" + +XINCLUDE_INCLUDE = XINCLUDE + "include" +XINCLUDE_FALLBACK = XINCLUDE + "fallback" + +## +# Fatal include error. + +class FatalIncludeError(etree.LxmlSyntaxError): + pass + +## +# ET compatible default loader. +# This loader reads an included resource from disk. +# +# @param href Resource reference. +# @param parse Parse mode. Either "xml" or "text". +# @param encoding Optional text encoding. +# @return The expanded resource. If the parse mode is "xml", this +# is an ElementTree instance. If the parse mode is "text", this +# is a Unicode string. If the loader fails, it can return None +# or raise an IOError exception. +# @throws IOError If the loader fails to load the resource. + +def default_loader(href, parse, encoding=None): + file = open(href, 'rb') + if parse == "xml": + data = etree.parse(file).getroot() + else: + data = file.read() + if not encoding: + encoding = 'utf-8' + data = data.decode(encoding) + file.close() + return data + +## +# Default loader used by lxml.etree - handles custom resolvers properly +# + +def _lxml_default_loader(href, parse, encoding=None, parser=None): + if parse == "xml": + data = etree.parse(href, parser).getroot() + else: + if "://" in href: + f = urlopen(href) + else: + f = open(href, 'rb') + data = f.read() + f.close() + if not encoding: + encoding = 'utf-8' + data = data.decode(encoding) + return data + +## +# Wrapper for ET compatibility - drops the parser + +def _wrap_et_loader(loader): + def load(href, parse, encoding=None, parser=None): + return loader(href, parse, encoding) + return load + + +## +# Expand XInclude directives. +# +# @param elem Root element. +# @param loader Optional resource loader. If omitted, it defaults +# to {@link default_loader}. If given, it should be a callable +# that implements the same interface as default_loader. +# @throws FatalIncludeError If the function fails to include a given +# resource, or if the tree contains malformed XInclude elements. +# @throws IOError If the function fails to load a given resource. +# @returns the node or its replacement if it was an XInclude node + +def include(elem, loader=None, base_url=None): + if base_url is None: + if hasattr(elem, 'getroot'): + tree = elem + elem = elem.getroot() + else: + tree = elem.getroottree() + if hasattr(tree, 'docinfo'): + base_url = tree.docinfo.URL + elif hasattr(elem, 'getroot'): + elem = elem.getroot() + _include(elem, loader, base_url=base_url) + +def _include(elem, loader=None, _parent_hrefs=None, base_url=None): + if loader is not None: + load_include = _wrap_et_loader(loader) + else: + load_include = _lxml_default_loader + + if _parent_hrefs is None: + _parent_hrefs = set() + + parser = elem.getroottree().parser + + include_elements = list( + elem.iter('{http://www.w3.org/2001/XInclude}*')) + + for e in include_elements: + if e.tag == XINCLUDE_INCLUDE: + # process xinclude directive + href = urljoin(base_url, e.get("href")) + parse = e.get("parse", "xml") + parent = e.getparent() + if parse == "xml": + if href in _parent_hrefs: + raise FatalIncludeError( + "recursive include of %r detected" % href + ) + _parent_hrefs.add(href) + node = load_include(href, parse, parser=parser) + if node is None: + raise FatalIncludeError( + "cannot load %r as %r" % (href, parse) + ) + node = _include(node, loader, _parent_hrefs) + if e.tail: + node.tail = (node.tail or "") + e.tail + if parent is None: + return node # replaced the root node! + parent.replace(e, node) + elif parse == "text": + text = load_include(href, parse, encoding=e.get("encoding")) + if text is None: + raise FatalIncludeError( + "cannot load %r as %r" % (href, parse) + ) + predecessor = e.getprevious() + if predecessor is not None: + predecessor.tail = (predecessor.tail or "") + text + elif parent is None: + return text # replaced the root node! + else: + parent.text = (parent.text or "") + text + (e.tail or "") + parent.remove(e) + else: + raise FatalIncludeError( + "unknown parse type in xi:include tag (%r)" % parse + ) + elif e.tag == XINCLUDE_FALLBACK: + parent = e.getparent() + if parent is not None and parent.tag != XINCLUDE_INCLUDE: + raise FatalIncludeError( + "xi:fallback tag must be child of xi:include (%r)" % e.tag + ) + else: + raise FatalIncludeError( + "Invalid element found in XInclude namespace (%r)" % e.tag + ) + return elem diff --git a/lib/lxml/__init__.py b/lib/lxml/__init__.py new file mode 100644 index 00000000..07cbe3a2 --- /dev/null +++ b/lib/lxml/__init__.py @@ -0,0 +1,20 @@ +# this is a package + +def get_include(): + """ + Returns a list of header include paths (for lxml itself, libxml2 + and libxslt) needed to compile C code against lxml if it was built + with statically linked libraries. + """ + import os + lxml_path = __path__[0] + include_path = os.path.join(lxml_path, 'includes') + includes = [include_path, lxml_path] + + for name in os.listdir(include_path): + path = os.path.join(include_path, name) + if os.path.isdir(path): + includes.append(path) + + return includes + diff --git a/lib/lxml/_elementpath.py b/lib/lxml/_elementpath.py new file mode 100644 index 00000000..bc9176e8 --- /dev/null +++ b/lib/lxml/_elementpath.py @@ -0,0 +1,306 @@ +# +# ElementTree +# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $ +# +# limited xpath support for element trees +# +# history: +# 2003-05-23 fl created +# 2003-05-28 fl added support for // etc +# 2003-08-27 fl fixed parsing of periods in element names +# 2007-09-10 fl new selection engine +# 2007-09-12 fl fixed parent selector +# 2007-09-13 fl added iterfind; changed findall to return a list +# 2007-11-30 fl added namespaces support +# 2009-10-30 fl added child element value filter +# +# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved. +# +# fredrik@pythonware.com +# http://www.pythonware.com +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2009 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. +# -------------------------------------------------------------------- + +## +# Implementation module for XPath support. There's usually no reason +# to import this module directly; the ElementTree does this for +# you, if needed. +## + +import re + +xpath_tokenizer_re = re.compile( + "(" + "'[^']*'|\"[^\"]*\"|" + "::|" + "//?|" + "\.\.|" + "\(\)|" + "[/.*:\[\]\(\)@=])|" + "((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|" + "\s+" + ) + +def xpath_tokenizer(pattern, namespaces=None): + for token in xpath_tokenizer_re.findall(pattern): + tag = token[1] + if tag and tag[0] != "{" and ":" in tag: + try: + prefix, uri = tag.split(":", 1) + if not namespaces: + raise KeyError + yield token[0], "{%s}%s" % (namespaces[prefix], uri) + except KeyError: + raise SyntaxError("prefix %r not found in prefix map" % prefix) + else: + yield token + + +def prepare_child(next, token): + tag = token[1] + def select(result): + for elem in result: + for e in elem.iterchildren(tag): + yield e + return select + +def prepare_star(next, token): + def select(result): + for elem in result: + for e in elem.iterchildren('*'): + yield e + return select + +def prepare_self(next, token): + def select(result): + return result + return select + +def prepare_descendant(next, token): + token = next() + if token[0] == "*": + tag = "*" + elif not token[0]: + tag = token[1] + else: + raise SyntaxError("invalid descendant") + def select(result): + for elem in result: + for e in elem.iterdescendants(tag): + yield e + return select + +def prepare_parent(next, token): + def select(result): + for elem in result: + parent = elem.getparent() + if parent is not None: + yield parent + return select + +def prepare_predicate(next, token): + # FIXME: replace with real parser!!! refs: + # http://effbot.org/zone/simple-iterator-parser.htm + # http://javascript.crockford.com/tdop/tdop.html + signature = [] + predicate = [] + while 1: + token = next() + if token[0] == "]": + break + if token[0] and token[0][:1] in "'\"": + token = "'", token[0][1:-1] + signature.append(token[0] or "-") + predicate.append(token[1]) + signature = "".join(signature) + # use signature to determine predicate type + if signature == "@-": + # [@attribute] predicate + key = predicate[1] + def select(result): + for elem in result: + if elem.get(key) is not None: + yield elem + return select + if signature == "@-='": + # [@attribute='value'] + key = predicate[1] + value = predicate[-1] + def select(result): + for elem in result: + if elem.get(key) == value: + yield elem + return select + if signature == "-" and not re.match("-?\d+$", predicate[0]): + # [tag] + tag = predicate[0] + def select(result): + for elem in result: + for _ in elem.iterchildren(tag): + yield elem + break + return select + if signature == "-='" and not re.match("-?\d+$", predicate[0]): + # [tag='value'] + tag = predicate[0] + value = predicate[-1] + def select(result): + for elem in result: + for e in elem.iterchildren(tag): + if "".join(e.itertext()) == value: + yield elem + break + return select + if signature == "-" or signature == "-()" or signature == "-()-": + # [index] or [last()] or [last()-index] + if signature == "-": + # [index] + index = int(predicate[0]) - 1 + if index < 0: + if index == -1: + raise SyntaxError( + "indices in path predicates are 1-based, not 0-based") + else: + raise SyntaxError("path index >= 1 expected") + else: + if predicate[0] != "last": + raise SyntaxError("unsupported function") + if signature == "-()-": + try: + index = int(predicate[2]) - 1 + except ValueError: + raise SyntaxError("unsupported expression") + else: + index = -1 + def select(result): + for elem in result: + parent = elem.getparent() + if parent is None: + continue + try: + # FIXME: what if the selector is "*" ? + elems = list(parent.iterchildren(elem.tag)) + if elems[index] is elem: + yield elem + except IndexError: + pass + return select + raise SyntaxError("invalid predicate") + +ops = { + "": prepare_child, + "*": prepare_star, + ".": prepare_self, + "..": prepare_parent, + "//": prepare_descendant, + "[": prepare_predicate, + } + +_cache = {} + +# -------------------------------------------------------------------- + +def _build_path_iterator(path, namespaces): + # compile selector pattern + if path[-1:] == "/": + path = path + "*" # implicit all (FIXME: keep this?) + try: + return _cache[(path, namespaces and tuple(sorted(namespaces.items())) or None)] + except KeyError: + pass + if len(_cache) > 100: + _cache.clear() + + if path[:1] == "/": + raise SyntaxError("cannot use absolute path on element") + stream = iter(xpath_tokenizer(path, namespaces)) + try: + _next = stream.next + except AttributeError: + # Python 3 + _next = stream.__next__ + try: + token = _next() + except StopIteration: + raise SyntaxError("empty path expression") + selector = [] + while 1: + try: + selector.append(ops[token[0]](_next, token)) + except StopIteration: + raise SyntaxError("invalid path") + try: + token = _next() + if token[0] == "/": + token = _next() + except StopIteration: + break + _cache[path] = selector + return selector + +## +# Iterate over the matching nodes + +def iterfind(elem, path, namespaces=None): + selector = _build_path_iterator(path, namespaces) + result = iter((elem,)) + for select in selector: + result = select(result) + return result + +## +# Find first matching object. + +def find(elem, path, namespaces=None): + it = iterfind(elem, path, namespaces) + try: + try: + _next = it.next + except AttributeError: + return next(it) + else: + return _next() + except StopIteration: + return None + +## +# Find all matching objects. + +def findall(elem, path, namespaces=None): + return list(iterfind(elem, path, namespaces)) + +## +# Find text for first matching object. + +def findtext(elem, path, default=None, namespaces=None): + el = find(elem, path, namespaces) + if el is None: + return default + else: + return el.text or '' diff --git a/lib/lxml/apihelpers.pxi b/lib/lxml/apihelpers.pxi new file mode 100644 index 00000000..c41e3044 --- /dev/null +++ b/lib/lxml/apihelpers.pxi @@ -0,0 +1,1645 @@ +# Private/public helper functions for API functions + +from lxml.includes cimport uri + +cdef object OrderedDict = None +try: + from collections import OrderedDict +except ImportError: + pass + +cdef void displayNode(xmlNode* c_node, indent): + # to help with debugging + cdef xmlNode* c_child + try: + print indent * u' ', c_node + c_child = c_node.children + while c_child is not NULL: + displayNode(c_child, indent + 1) + c_child = c_child.next + finally: + return # swallow any exceptions + +cdef inline int _assertValidNode(_Element element) except -1: + assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element) + +cdef inline int _assertValidDoc(_Document doc) except -1: + assert doc._c_doc is not NULL, u"invalid Document proxy at %s" % id(doc) + +cdef _Document _documentOrRaise(object input): + u"""Call this to get the document of a _Document, _ElementTree or _Element + object, or to raise an exception if it can't be determined. + + Should be used in all API functions for consistency. + """ + cdef _Document doc + if isinstance(input, _ElementTree): + if (<_ElementTree>input)._context_node is not None: + doc = (<_ElementTree>input)._context_node._doc + else: + doc = None + elif isinstance(input, _Element): + doc = (<_Element>input)._doc + elif isinstance(input, _Document): + doc = <_Document>input + else: + raise TypeError, u"Invalid input object: %s" % \ + python._fqtypename(input).decode('utf8') + if doc is None: + raise ValueError, u"Input object has no document: %s" % \ + python._fqtypename(input).decode('utf8') + _assertValidDoc(doc) + return doc + +cdef _Element _rootNodeOrRaise(object input): + u"""Call this to get the root node of a _Document, _ElementTree or + _Element object, or to raise an exception if it can't be determined. + + Should be used in all API functions for consistency. + """ + cdef _Element node + if isinstance(input, _ElementTree): + node = (<_ElementTree>input)._context_node + elif isinstance(input, _Element): + node = <_Element>input + elif isinstance(input, _Document): + node = (<_Document>input).getroot() + else: + raise TypeError, u"Invalid input object: %s" % \ + python._fqtypename(input).decode('utf8') + if (node is None or not node._c_node or + node._c_node.type != tree.XML_ELEMENT_NODE): + raise ValueError, u"Input object has no element: %s" % \ + python._fqtypename(input).decode('utf8') + _assertValidNode(node) + return node + +cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, + _BaseParser parser, text, tail, attrib, nsmap, + dict extra_attrs): + u"""Create a new element and initialize text content, namespaces and + attributes. + + This helper function will reuse as much of the existing document as + possible: + + If 'parser' is None, the parser will be inherited from 'doc' or the + default parser will be used. + + If 'doc' is None, 'c_doc' is used to create a new _Document and the new + element is made its root node. + + If 'c_doc' is also NULL, a new xmlDoc will be created. + """ + cdef xmlNode* c_node + if doc is not None: + c_doc = doc._c_doc + ns_utf, name_utf = _getNsTag(tag) + if parser is not None and parser._for_html: + _htmlTagValidOrRaise(name_utf) + if c_doc is NULL: + c_doc = _newHTMLDoc() + else: + _tagValidOrRaise(name_utf) + if c_doc is NULL: + c_doc = _newXMLDoc() + c_node = _createElement(c_doc, name_utf) + if c_node is NULL: + if doc is None and c_doc is not NULL: + tree.xmlFreeDoc(c_doc) + raise MemoryError() + try: + if doc is None: + tree.xmlDocSetRootElement(c_doc, c_node) + doc = _documentFactory(c_doc, parser) + if text is not None: + _setNodeText(c_node, text) + if tail is not None: + _setTailText(c_node, tail) + # add namespaces to node if necessary + _initNodeNamespaces(c_node, doc, ns_utf, nsmap) + _initNodeAttributes(c_node, doc, attrib, extra_attrs) + return _elementFactory(doc, c_node) + except: + # free allocated c_node/c_doc unless Python does it for us + if c_node.doc is not c_doc: + # node not yet in document => will not be freed by document + if tail is not None: + _removeText(c_node.next) # tail + tree.xmlFreeNode(c_node) + if doc is None: + # c_doc will not be freed by doc + tree.xmlFreeDoc(c_doc) + raise + +cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf, + _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1: + u"""Initialise a new Element object. + + This is used when users instantiate a Python Element subclass + directly, without it being mapped to an existing XML node. + """ + cdef xmlDoc* c_doc + cdef xmlNode* c_node + cdef _Document doc + if is_html: + _htmlTagValidOrRaise(name_utf) + c_doc = _newHTMLDoc() + else: + _tagValidOrRaise(name_utf) + c_doc = _newXMLDoc() + c_node = _createElement(c_doc, name_utf) + if c_node is NULL: + if c_doc is not NULL: + tree.xmlFreeDoc(c_doc) + raise MemoryError() + tree.xmlDocSetRootElement(c_doc, c_node) + doc = _documentFactory(c_doc, parser) + # add namespaces to node if necessary + _initNodeNamespaces(c_node, doc, ns_utf, nsmap) + _initNodeAttributes(c_node, doc, attrib, extra_attrs) + _registerProxy(element, doc, c_node) + element._init() + return 0 + +cdef _Element _makeSubElement(_Element parent, tag, text, tail, + attrib, nsmap, dict extra_attrs): + u"""Create a new child element and initialize text content, namespaces and + attributes. + """ + cdef xmlNode* c_node + cdef xmlDoc* c_doc + if parent is None or parent._doc is None: + return None + _assertValidNode(parent) + ns_utf, name_utf = _getNsTag(tag) + c_doc = parent._doc._c_doc + + if parent._doc._parser is not None and parent._doc._parser._for_html: + _htmlTagValidOrRaise(name_utf) + else: + _tagValidOrRaise(name_utf) + + c_node = _createElement(c_doc, name_utf) + if c_node is NULL: + raise MemoryError() + tree.xmlAddChild(parent._c_node, c_node) + + try: + if text is not None: + _setNodeText(c_node, text) + if tail is not None: + _setTailText(c_node, tail) + + # add namespaces to node if necessary + _initNodeNamespaces(c_node, parent._doc, ns_utf, nsmap) + _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs) + return _elementFactory(parent._doc, c_node) + except: + # make sure we clean up in case of an error + _removeNode(parent._doc, c_node) + raise + +cdef int _initNodeNamespaces(xmlNode* c_node, _Document doc, + object node_ns_utf, object nsmap) except -1: + u"""Lookup current namespace prefixes, then set namespace structure for + node and register new ns-prefix mappings. + + This only works for a newly created node! + """ + cdef xmlNs* c_ns + cdef list nsdefs + if not nsmap: + if node_ns_utf is not None: + _uriValidOrRaise(node_ns_utf) + doc._setNodeNs(c_node, _xcstr(node_ns_utf)) + return 0 + + nsdefs = list(nsmap.items()) + if None in nsmap and len(nsdefs) > 1: + # Move the default namespace to the end. This makes sure libxml2 + # prefers a prefix if the ns is defined redundantly on the same + # element. That way, users can work around a problem themselves + # where default namespace attributes on non-default namespaced + # elements serialise without prefix (i.e. into the non-default + # namespace). + item = (None, nsmap[None]) + nsdefs.remove(item) + nsdefs.append(item) + + for prefix, href in nsdefs: + href_utf = _utf8(href) + _uriValidOrRaise(href_utf) + c_href = _xcstr(href_utf) + if prefix is not None: + prefix_utf = _utf8(prefix) + _prefixValidOrRaise(prefix_utf) + c_prefix = _xcstr(prefix_utf) + else: + c_prefix = NULL + # add namespace with prefix if it is not already known + c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix) + if c_ns is NULL or \ + c_ns.href is NULL or \ + tree.xmlStrcmp(c_ns.href, c_href) != 0: + c_ns = tree.xmlNewNs(c_node, c_href, c_prefix) + if href_utf == node_ns_utf: + tree.xmlSetNs(c_node, c_ns) + node_ns_utf = None + + if node_ns_utf is not None: + doc._setNodeNs(c_node, _xcstr(node_ns_utf)) + return 0 + +cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra): + u"""Initialise the attributes of an element node. + """ + cdef bint is_html + cdef xmlNs* c_ns + if attrib is not None and not hasattr(attrib, u'items'): + raise TypeError, u"Invalid attribute dictionary: %s" % \ + python._fqtypename(attrib).decode('utf8') + if not attrib and not extra: + return # nothing to do + is_html = doc._parser._for_html + seen = set() + if extra: + for name, value in sorted(extra.items()): + _addAttributeToNode(c_node, doc, is_html, name, value, seen) + if attrib: + # attrib will usually be a plain unordered dict + if type(attrib) is dict: + attrib = sorted(attrib.items()) + elif isinstance(attrib, _Attrib) or ( + OrderedDict is not None and isinstance(attrib, OrderedDict)): + attrib = attrib.items() + else: + # assume it's an unordered mapping of some kind + attrib = sorted(attrib.items()) + for name, value in attrib: + _addAttributeToNode(c_node, doc, is_html, name, value, seen) + +cdef int _addAttributeToNode(xmlNode* c_node, _Document doc, bint is_html, + name, value, set seen_tags) except -1: + ns_utf, name_utf = tag = _getNsTag(name) + if tag in seen_tags: + return 0 + seen_tags.add(tag) + if not is_html: + _attributeValidOrRaise(name_utf) + value_utf = _utf8(value) + if ns_utf is None: + tree.xmlNewProp(c_node, _xcstr(name_utf), _xcstr(value_utf)) + else: + _uriValidOrRaise(ns_utf) + c_ns = doc._findOrBuildNodeNs(c_node, _xcstr(ns_utf), NULL, 1) + tree.xmlNewNsProp(c_node, c_ns, + _xcstr(name_utf), _xcstr(value_utf)) + return 0 + +ctypedef struct _ns_node_ref: + xmlNs* ns + xmlNode* node + +cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element) except -1: + u"""Remove any namespace declarations from a subtree that are not used by + any of its elements (or attributes). + """ + cdef _ns_node_ref* c_ns_list + cdef _ns_node_ref* c_nsref_ptr + cdef xmlNs* c_nsdef + cdef xmlNode* c_node + cdef size_t c_ns_list_size + cdef size_t c_ns_list_len + cdef size_t i + + c_ns_list = NULL + c_ns_list_size = 0 + c_ns_list_len = 0 + + if c_element.parent is not NULL and \ + c_element.parent.type == tree.XML_DOCUMENT_NODE: + # include the document node + c_nsdef = c_element.parent.nsDef + while c_nsdef is not NULL: + if c_ns_list_len >= c_ns_list_size: + if c_ns_list is NULL: + c_ns_list_size = 20 + else: + c_ns_list_size *= 2 + c_nsref_ptr = <_ns_node_ref*> stdlib.realloc( + c_ns_list, c_ns_list_size * sizeof(_ns_node_ref)) + if c_nsref_ptr is NULL: + if c_ns_list is not NULL: + stdlib.free(c_ns_list) + raise MemoryError() + c_ns_list = c_nsref_ptr + + c_ns_list[c_ns_list_len].ns = c_nsdef + c_ns_list[c_ns_list_len].node = c_element.parent + c_ns_list_len += 1 + c_nsdef = c_nsdef.next + + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1) + # collect all new namespace declarations into the ns list + c_nsdef = c_element.nsDef + while c_nsdef is not NULL: + if c_ns_list_len >= c_ns_list_size: + if c_ns_list is NULL: + c_ns_list_size = 20 + else: + c_ns_list_size *= 2 + c_nsref_ptr = <_ns_node_ref*> stdlib.realloc( + c_ns_list, c_ns_list_size * sizeof(_ns_node_ref)) + if c_nsref_ptr is NULL: + if c_ns_list is not NULL: + stdlib.free(c_ns_list) + raise MemoryError() + c_ns_list = c_nsref_ptr + + c_ns_list[c_ns_list_len].ns = c_nsdef + c_ns_list[c_ns_list_len].node = c_element + c_ns_list_len += 1 + c_nsdef = c_nsdef.next + + # remove all namespace declarations from the list that are referenced + if c_element.type == tree.XML_ELEMENT_NODE: + c_node = c_element + while c_node is not NULL: + if c_node.ns is not NULL: + for i in range(c_ns_list_len): + if c_node.ns is c_ns_list[i].ns: + c_ns_list_len -= 1 + c_ns_list[i].ns = c_ns_list[c_ns_list_len].ns + c_ns_list[i].node = c_ns_list[c_ns_list_len].node + c_ns_list[c_ns_list_len].ns = NULL + c_ns_list[c_ns_list_len].node = NULL + break + if c_node is c_element: + # continue with attributes + c_node = c_element.properties + else: + c_node = c_node.next + tree.END_FOR_EACH_ELEMENT_FROM(c_element) + + if c_ns_list is NULL: + return 0 + + # free all namespace declarations that remained in the list + for i in range(c_ns_list_len): + c_node = c_ns_list[i].node + c_nsdef = c_node.nsDef + if c_nsdef is c_ns_list[i].ns: + c_node.nsDef = c_node.nsDef.next + else: + while c_nsdef.next is not c_ns_list[i].ns: + c_nsdef = c_nsdef.next + c_nsdef.next = c_nsdef.next.next + tree.xmlFreeNs(c_ns_list[i].ns) + + if c_ns_list is not NULL: + stdlib.free(c_ns_list) + return 0 + +cdef xmlNs* _searchNsByHref(xmlNode* c_node, const_xmlChar* c_href, bint is_attribute): + u"""Search a namespace declaration that covers a node (element or + attribute). + + For attributes, try to find a prefixed namespace declaration + instead of the default namespaces. This helps in supporting + round-trips for attributes on elements with a different namespace. + """ + cdef xmlNs* c_ns + cdef xmlNs* c_default_ns = NULL + cdef xmlNode* c_element + if c_href is NULL or c_node is NULL or c_node.type == tree.XML_ENTITY_REF_NODE: + return NULL + if tree.xmlStrcmp(c_href, tree.XML_XML_NAMESPACE) == 0: + # no special cases here, let libxml2 handle this + return tree.xmlSearchNsByHref(c_node.doc, c_node, c_href) + if c_node.type == tree.XML_ATTRIBUTE_NODE: + is_attribute = 1 + while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE: + c_node = c_node.parent + c_element = c_node + while c_node is not NULL: + if c_node.type == tree.XML_ELEMENT_NODE: + c_ns = c_node.nsDef + while c_ns is not NULL: + if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0: + if c_ns.prefix is NULL and is_attribute: + # for attributes, continue searching a named + # prefix, but keep the first default namespace + # declaration that we found + if c_default_ns is NULL: + c_default_ns = c_ns + elif tree.xmlSearchNs( + c_element.doc, c_element, c_ns.prefix) is c_ns: + # start node is in namespace scope => found! + return c_ns + c_ns = c_ns.next + if c_node is not c_element and c_node.ns is not NULL: + # optimise: the node may have the namespace itself + c_ns = c_node.ns + if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0: + if c_ns.prefix is NULL and is_attribute: + # for attributes, continue searching a named + # prefix, but keep the first default namespace + # declaration that we found + if c_default_ns is NULL: + c_default_ns = c_ns + elif tree.xmlSearchNs( + c_element.doc, c_element, c_ns.prefix) is c_ns: + # start node is in namespace scope => found! + return c_ns + c_node = c_node.parent + # nothing found => use a matching default namespace or fail + if c_default_ns is not NULL: + if tree.xmlSearchNs(c_element.doc, c_element, NULL) is c_default_ns: + return c_default_ns + return NULL + +cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1: + # NOTE: this does not deallocate the node, just unlink it! + cdef xmlNode* c_parent + cdef xmlNode* c_child + if c_node.children is NULL: + tree.xmlUnlinkNode(c_node) + return 0 + + c_parent = c_node.parent + # fix parent links of children + c_child = c_node.children + while c_child is not NULL: + c_child.parent = c_parent + c_child = c_child.next + + # fix namespace references of children if their parent's namespace + # declarations get lost + if c_node.nsDef is not NULL: + c_child = c_node.children + while c_child is not NULL: + moveNodeToDocument(doc, doc._c_doc, c_child) + c_child = c_child.next + + # fix sibling links to/from child slice + if c_node.prev is NULL: + c_parent.children = c_node.children + else: + c_node.prev.next = c_node.children + c_node.children.prev = c_node.prev + if c_node.next is NULL: + c_parent.last = c_node.last + else: + c_node.next.prev = c_node.last + c_node.last.next = c_node.next + + # unlink c_node + c_node.children = c_node.last = NULL + c_node.parent = c_node.next = c_node.prev = NULL + return 0 + +cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node): + c_href = _getNs(c_attrib_node) + value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href) + try: + result = funicode(value) + finally: + tree.xmlFree(value) + return result + +cdef object _attributeValueFromNsName(xmlNode* c_element, + const_xmlChar* c_href, const_xmlChar* c_name): + c_result = tree.xmlGetNsProp(c_element, c_name, c_href) + if c_result is NULL: + return None + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) + return result + +cdef object _getNodeAttributeValue(xmlNode* c_node, key, default): + ns, tag = _getNsTag(key) + c_href = NULL if ns is None else _xcstr(ns) + c_result = tree.xmlGetNsProp(c_node, _xcstr(tag), c_href) + if c_result is NULL: + # XXX free namespace that is not in use..? + return default + try: + result = funicode(c_result) + finally: + tree.xmlFree(c_result) + return result + +cdef inline object _getAttributeValue(_Element element, key, default): + return _getNodeAttributeValue(element._c_node, key, default) + +cdef int _setAttributeValue(_Element element, key, value) except -1: + cdef xmlNs* c_ns + ns, tag = _getNsTag(key) + if not element._doc._parser._for_html: + _attributeValidOrRaise(tag) + c_tag = _xcstr(tag) + if isinstance(value, QName): + value = _resolveQNameText(element, value) + else: + value = _utf8(value) + c_value = _xcstr(value) + if ns is None: + c_ns = NULL + else: + c_ns = element._doc._findOrBuildNodeNs(element._c_node, _xcstr(ns), NULL, 1) + tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value) + return 0 + +cdef int _delAttribute(_Element element, key) except -1: + ns, tag = _getNsTag(key) + c_href = NULL if ns is None else _xcstr(ns) + if _delAttributeFromNsName(element._c_node, c_href, _xcstr(tag)): + raise KeyError, key + return 0 + +cdef int _delAttributeFromNsName(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name): + c_attr = tree.xmlHasNsProp(c_node, c_name, c_href) + if c_attr is NULL: + # XXX free namespace that is not in use..? + return -1 + tree.xmlRemoveProp(c_attr) + return 0 + +cdef list _collectAttributes(xmlNode* c_node, int collecttype): + u"""Collect all attributes of a node in a list. Depending on collecttype, + it collects either the name (1), the value (2) or the name-value tuples. + """ + cdef Py_ssize_t count + c_attr = c_node.properties + count = 0 + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: + count += 1 + c_attr = c_attr.next + + if not count: + return [] + + attributes = [None] * count + c_attr = c_node.properties + count = 0 + while c_attr is not NULL: + if c_attr.type == tree.XML_ATTRIBUTE_NODE: + if collecttype == 1: + item = _namespacedName(c_attr) + elif collecttype == 2: + item = _attributeValue(c_node, c_attr) + else: + item = (_namespacedName(c_attr), + _attributeValue(c_node, c_attr)) + attributes[count] = item + count += 1 + c_attr = c_attr.next + return attributes + +cdef object __RE_XML_ENCODING = re.compile( + ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) + +cdef object __REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub +cdef object __HAS_XML_ENCODING = __RE_XML_ENCODING.match + +cdef object _stripEncodingDeclaration(object xml_string): + # this is a hack to remove the XML encoding declaration from unicode + return __REPLACE_XML_ENCODING(ur'\g<1>\g<2>', xml_string) + +cdef bint _hasEncodingDeclaration(object xml_string) except -1: + # check if a (unicode) string has an XML encoding declaration + return __HAS_XML_ENCODING(xml_string) is not None + +cdef inline bint _hasText(xmlNode* c_node): + return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL + +cdef inline bint _hasTail(xmlNode* c_node): + return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL + +cdef _collectText(xmlNode* c_node): + u"""Collect all text nodes and return them as a unicode string. + + Start collecting at c_node. + + If there was no text to collect, return None + """ + cdef Py_ssize_t scount + cdef xmlChar* c_text + cdef xmlNode* c_node_cur + # check for multiple text nodes + scount = 0 + c_text = NULL + c_node_cur = c_node = _textNodeOrSkip(c_node) + while c_node_cur is not NULL: + if c_node_cur.content[0] != c'\0': + c_text = c_node_cur.content + scount += 1 + c_node_cur = _textNodeOrSkip(c_node_cur.next) + + # handle two most common cases first + if c_text is NULL: + return '' if scount > 0 else None + if scount == 1: + return funicode(c_text) + + # the rest is not performance critical anymore + result = b'' + while c_node is not NULL: + result += c_node.content + c_node = _textNodeOrSkip(c_node.next) + return funicode(result) + +cdef void _removeText(xmlNode* c_node): + u"""Remove all text nodes. + + Start removing at c_node. + """ + cdef xmlNode* c_next + c_node = _textNodeOrSkip(c_node) + while c_node is not NULL: + c_next = _textNodeOrSkip(c_node.next) + tree.xmlUnlinkNode(c_node) + tree.xmlFreeNode(c_node) + c_node = c_next + +cdef int _setNodeText(xmlNode* c_node, value) except -1: + cdef xmlNode* c_text_node + # remove all text nodes at the start first + _removeText(c_node.children) + if value is None: + return 0 + # now add new text node with value at start + if python._isString(value): + text = _utf8(value) + c_text_node = tree.xmlNewDocText(c_node.doc, _xcstr(text)) + elif isinstance(value, CDATA): + c_text_node = tree.xmlNewCDataBlock( + c_node.doc, _xcstr((value)._utf8_data), + python.PyBytes_GET_SIZE((value)._utf8_data)) + else: + # this will raise the right error + _utf8(value) + return -1 + if c_node.children is NULL: + tree.xmlAddChild(c_node, c_text_node) + else: + tree.xmlAddPrevSibling(c_node.children, c_text_node) + return 0 + +cdef int _setTailText(xmlNode* c_node, value) except -1: + cdef xmlNode* c_text_node + # remove all text nodes at the start first + _removeText(c_node.next) + if value is None: + return 0 + text = _utf8(value) + c_text_node = tree.xmlNewDocText(c_node.doc, _xcstr(text)) + # XXX what if we're the top element? + tree.xmlAddNextSibling(c_node, c_text_node) + return 0 + +cdef bytes _resolveQNameText(_Element element, value): + cdef xmlNs* c_ns + ns, tag = _getNsTag(value) + if ns is None: + return tag + else: + c_ns = element._doc._findOrBuildNodeNs( + element._c_node, _xcstr(ns), NULL, 0) + return python.PyBytes_FromFormat('%s:%s', c_ns.prefix, _cstr(tag)) + +cdef inline bint _hasChild(xmlNode* c_node): + return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL + +cdef inline Py_ssize_t _countElements(xmlNode* c_node): + u"Counts the elements within the following siblings and the node itself." + cdef Py_ssize_t count + count = 0 + while c_node is not NULL: + if _isElement(c_node): + count += 1 + c_node = c_node.next + return count + +cdef int _findChildSlice( + slice sliceobject, xmlNode* c_parent, + xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1: + u"""Resolve a children slice. + + Returns the start node, step size and the slice length in the + pointer arguments. + """ + cdef Py_ssize_t start = 0, stop = 0, childcount + childcount = _countElements(c_parent.children) + if childcount == 0: + c_start_node[0] = NULL + c_length[0] = 0 + if sliceobject.step is None: + c_step[0] = 1 + else: + python._PyEval_SliceIndex(sliceobject.step, c_step) + return 0 + python.PySlice_GetIndicesEx( + sliceobject, childcount, &start, &stop, c_step, c_length) + if start > childcount / 2: + c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1) + else: + c_start_node[0] = _findChild(c_parent, start) + return 0 + +cdef bint _isFullSlice(slice sliceobject) except -1: + u"""Conservative guess if this slice is a full slice as in ``s[:]``. + """ + cdef Py_ssize_t step = 0 + if sliceobject is None: + return 0 + if sliceobject.start is None and \ + sliceobject.stop is None: + if sliceobject.step is None: + return 1 + python._PyEval_SliceIndex(sliceobject.step, &step) + if step == 1: + return 1 + return 0 + return 0 + +cdef _collectChildren(_Element element): + cdef xmlNode* c_node + cdef list result = [] + c_node = element._c_node.children + if c_node is not NULL: + if not _isElement(c_node): + c_node = _nextElement(c_node) + while c_node is not NULL: + result.append(_elementFactory(element._doc, c_node)) + c_node = _nextElement(c_node) + return result + +cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index): + if index < 0: + return _findChildBackwards(c_node, -index - 1) + else: + return _findChildForwards(c_node, index) + +cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index): + u"""Return child element of c_node with index, or return NULL if not found. + """ + cdef xmlNode* c_child + cdef Py_ssize_t c + c_child = c_node.children + c = 0 + while c_child is not NULL: + if _isElement(c_child): + if c == index: + return c_child + c += 1 + c_child = c_child.next + return NULL + +cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index): + u"""Return child element of c_node with index, or return NULL if not found. + Search from the end. + """ + cdef xmlNode* c_child + cdef Py_ssize_t c + c_child = c_node.last + c = 0 + while c_child is not NULL: + if _isElement(c_child): + if c == index: + return c_child + c += 1 + c_child = c_child.prev + return NULL + +cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil: + u"""Return the node if it's a text node. Skip over ignorable nodes in a + series of text nodes. Return NULL if a non-ignorable node is found. + + This is used to skip over XInclude nodes when collecting adjacent text + nodes. + """ + while c_node is not NULL: + if c_node.type == tree.XML_TEXT_NODE or \ + c_node.type == tree.XML_CDATA_SECTION_NODE: + return c_node + elif c_node.type == tree.XML_XINCLUDE_START or \ + c_node.type == tree.XML_XINCLUDE_END: + c_node = c_node.next + else: + return NULL + return NULL + +cdef inline xmlNode* _nextElement(xmlNode* c_node): + u"""Given a node, find the next sibling that is an element. + """ + if c_node is NULL: + return NULL + c_node = c_node.next + while c_node is not NULL: + if _isElement(c_node): + return c_node + c_node = c_node.next + return NULL + +cdef inline xmlNode* _previousElement(xmlNode* c_node): + u"""Given a node, find the next sibling that is an element. + """ + if c_node is NULL: + return NULL + c_node = c_node.prev + while c_node is not NULL: + if _isElement(c_node): + return c_node + c_node = c_node.prev + return NULL + +cdef inline xmlNode* _parentElement(xmlNode* c_node): + u"Given a node, find the parent element." + if c_node is NULL or not _isElement(c_node): + return NULL + c_node = c_node.parent + if c_node is NULL or not _isElement(c_node): + return NULL + return c_node + +cdef inline bint _tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name): + u"""Tests if the node matches namespace URI and tag name. + + A node matches if it matches both c_href and c_name. + + A node matches c_href if any of the following is true: + * c_href is NULL + * its namespace is NULL and c_href is the empty string + * its namespace string equals the c_href string + + A node matches c_name if any of the following is true: + * c_name is NULL + * its name string equals the c_name string + """ + if c_node is NULL: + return 0 + if c_node.type != tree.XML_ELEMENT_NODE: + # not an element, only succeed if we match everything + return c_name is NULL and c_href is NULL + if c_name is NULL: + if c_href is NULL: + # always match + return 1 + else: + c_node_href = _getNs(c_node) + if c_node_href is NULL: + return c_href[0] == c'\0' + else: + return tree.xmlStrcmp(c_node_href, c_href) == 0 + elif c_href is NULL: + if _getNs(c_node) is not NULL: + return 0 + return c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0 + elif c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0: + c_node_href = _getNs(c_node) + if c_node_href is NULL: + return c_href[0] == c'\0' + else: + return tree.xmlStrcmp(c_node_href, c_href) == 0 + else: + return 0 + +cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname): + u"""Tests if the node matches namespace URI and tag name. + + This differs from _tagMatches() in that it does not consider a + NULL value in qname.href a wildcard, and that it expects the c_name + to be taken from the doc dict, i.e. it only compares the names by + address. + + A node matches if it matches both href and c_name of the qname. + + A node matches c_href if any of the following is true: + * its namespace is NULL and c_href is the empty string + * its namespace string equals the c_href string + + A node matches c_name if any of the following is true: + * c_name is NULL + * its name string points to the same address (!) as c_name + """ + return _nsTagMatchesExactly(_getNs(c_node), c_node.name, c_qname) + +cdef inline bint _nsTagMatchesExactly(const_xmlChar* c_node_href, + const_xmlChar* c_node_name, + qname* c_qname): + u"""Tests if name and namespace URI match those of c_qname. + + This differs from _tagMatches() in that it does not consider a + NULL value in qname.href a wildcard, and that it expects the c_name + to be taken from the doc dict, i.e. it only compares the names by + address. + + A node matches if it matches both href and c_name of the qname. + + A node matches c_href if any of the following is true: + * its namespace is NULL and c_href is the empty string + * its namespace string equals the c_href string + + A node matches c_name if any of the following is true: + * c_name is NULL + * its name string points to the same address (!) as c_name + """ + cdef char* c_href + if c_qname.c_name is not NULL and c_qname.c_name is not c_node_name: + return 0 + if c_qname.href is NULL: + return 1 + c_href = python.__cstr(c_qname.href) + if c_href[0] == '\0': + return c_node_href is NULL or c_node_href[0] == '\0' + elif c_node_href is NULL: + return 0 + else: + return tree.xmlStrcmp(c_href, c_node_href) == 0 + +cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags, + qname* c_ns_tags, bint force_into_dict) except -1: + u"""Map a sequence of (name, namespace) pairs to a qname array for efficient + matching with _tagMatchesExactly() above. + + Note that each qname struct in the array owns its href byte string object + if it is not NULL. + """ + cdef Py_ssize_t count = 0, i + cdef bytes ns, tag + for ns, tag in ns_tags: + if tag is None: + c_tag = NULL + elif force_into_dict: + c_tag = tree.xmlDictLookup(c_doc.dict, _xcstr(tag), len(tag)) + if c_tag is NULL: + # clean up before raising the error + for i in xrange(count): + cpython.ref.Py_XDECREF(c_ns_tags[i].href) + raise MemoryError() + else: + c_tag = tree.xmlDictExists(c_doc.dict, _xcstr(tag), len(tag)) + if c_tag is NULL: + # not in the dict => not in the document + continue + c_ns_tags[count].c_name = c_tag + if ns is None: + c_ns_tags[count].href = NULL + else: + cpython.ref.Py_INCREF(ns) # keep an owned reference! + c_ns_tags[count].href = ns + count += 1 + return count + +cdef int _removeNode(_Document doc, xmlNode* c_node) except -1: + u"""Unlink and free a node and subnodes if possible. Otherwise, make sure + it's self-contained. + """ + cdef xmlNode* c_next + c_next = c_node.next + tree.xmlUnlinkNode(c_node) + _moveTail(c_next, c_node) + if not attemptDeallocation(c_node): + # make namespaces absolute + moveNodeToDocument(doc, c_node.doc, c_node) + return 0 + +cdef int _removeSiblings(xmlNode* c_element, tree.xmlElementType node_type, bint with_tail) except -1: + cdef xmlNode* c_node + cdef xmlNode* c_next + c_node = c_element.next + while c_node is not NULL: + c_next = _nextElement(c_node) + if c_node.type == node_type: + if with_tail: + _removeText(c_node.next) + tree.xmlUnlinkNode(c_node) + attemptDeallocation(c_node) + c_node = c_next + c_node = c_element.prev + while c_node is not NULL: + c_next = _previousElement(c_node) + if c_node.type == node_type: + if with_tail: + _removeText(c_node.next) + tree.xmlUnlinkNode(c_node) + attemptDeallocation(c_node) + c_node = c_next + return 0 + +cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target): + cdef xmlNode* c_next + # tail support: look for any text nodes trailing this node and + # move them too + c_tail = _textNodeOrSkip(c_tail) + while c_tail is not NULL: + c_next = _textNodeOrSkip(c_tail.next) + c_target = tree.xmlAddNextSibling(c_target, c_tail) + c_tail = c_next + +cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1: + cdef xmlNode* c_new_tail + # tail copying support: look for any text nodes trailing this node and + # copy it to the target node + c_tail = _textNodeOrSkip(c_tail) + while c_tail is not NULL: + if c_target.doc is not c_tail.doc: + c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0) + else: + c_new_tail = tree.xmlCopyNode(c_tail, 0) + if c_new_tail is NULL: + raise MemoryError() + c_target = tree.xmlAddNextSibling(c_target, c_new_tail) + c_tail = _textNodeOrSkip(c_tail.next) + return 0 + +cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1: + cdef xmlNode* c_copy + cdef xmlNode* c_sibling = c_node + while c_sibling.prev != NULL and \ + (c_sibling.prev.type == tree.XML_PI_NODE or \ + c_sibling.prev.type == tree.XML_COMMENT_NODE): + c_sibling = c_sibling.prev + while c_sibling != c_node: + c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) + if c_copy is NULL: + raise MemoryError() + tree.xmlAddPrevSibling(c_target, c_copy) + c_sibling = c_sibling.next + while c_sibling.next != NULL and \ + (c_sibling.next.type == tree.XML_PI_NODE or \ + c_sibling.next.type == tree.XML_COMMENT_NODE): + c_sibling = c_sibling.next + c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1) + if c_copy is NULL: + raise MemoryError() + tree.xmlAddNextSibling(c_target, c_copy) + +cdef int _deleteSlice(_Document doc, xmlNode* c_node, + Py_ssize_t count, Py_ssize_t step) except -1: + u"""Delete slice, ``count`` items starting with ``c_node`` with a step + width of ``step``. + """ + cdef xmlNode* c_next + cdef Py_ssize_t c, i + cdef _node_to_node_function next_element + if c_node is NULL: + return 0 + if step > 0: + next_element = _nextElement + else: + step = -step + next_element = _previousElement + # now start deleting nodes + c = 0 + c_next = c_node + while c_node is not NULL and c < count: + for i in range(step): + c_next = next_element(c_next) + _removeNode(doc, c_node) + c += 1 + c_node = c_next + return 0 + +cdef int _replaceSlice(_Element parent, xmlNode* c_node, + Py_ssize_t slicelength, Py_ssize_t step, + bint left_to_right, elements) except -1: + u"""Replace the slice of ``count`` elements starting at ``c_node`` with + positive step width ``step`` by the Elements in ``elements``. The + direction is given by the boolean argument ``left_to_right``. + + ``c_node`` may be NULL to indicate the end of the children list. + """ + cdef xmlNode* c_orig_neighbour + cdef xmlNode* c_next + cdef xmlDoc* c_source_doc + cdef _Element element + cdef Py_ssize_t seqlength, i, c + cdef _node_to_node_function next_element + assert step > 0 + if left_to_right: + next_element = _nextElement + else: + next_element = _previousElement + + if not isinstance(elements, (list, tuple)): + elements = list(elements) + + if step > 1: + # *replacing* children stepwise with list => check size! + seqlength = len(elements) + if seqlength != slicelength: + raise ValueError, u"attempt to assign sequence of size %d " \ + u"to extended slice of size %d" % (seqlength, slicelength) + + if c_node is NULL: + # no children yet => add all elements straight away + if left_to_right: + for element in elements: + assert element is not None, u"Node must not be None" + _appendChild(parent, element) + else: + for element in elements: + assert element is not None, u"Node must not be None" + _prependChild(parent, element) + return 0 + + # remove the elements first as some might be re-added + if left_to_right: + # L->R, remember left neighbour + c_orig_neighbour = _previousElement(c_node) + else: + # R->L, remember right neighbour + c_orig_neighbour = _nextElement(c_node) + + # We remove the original slice elements one by one. Since we hold + # a Python reference to all elements that we will insert, it is + # safe to let _removeNode() try (and fail) to free them even if + # the element itself or one of its descendents will be reinserted. + c = 0 + c_next = c_node + while c_node is not NULL and c < slicelength: + for i in range(step): + c_next = next_element(c_next) + _removeNode(parent._doc, c_node) + c += 1 + c_node = c_next + + # make sure each element is inserted only once + elements = iter(elements) + + # find the first node right of the new insertion point + if left_to_right: + if c_orig_neighbour is not NULL: + c_node = next_element(c_orig_neighbour) + else: + # before the first element + c_node = _findChildForwards(parent._c_node, 0) + elif c_orig_neighbour is NULL: + # at the end, but reversed stepping + # append one element and go to the next insertion point + for element in elements: + assert element is not None, u"Node must not be None" + _appendChild(parent, element) + c_node = element._c_node + if slicelength > 0: + slicelength -= 1 + for i in range(1, step): + c_node = next_element(c_node) + break + + if left_to_right: + # adjust step size after removing slice as we are not stepping + # over the newly inserted elements + step -= 1 + + # now insert elements where we removed them + if c_node is not NULL: + for element in elements: + assert element is not None, u"Node must not be None" + _assertValidNode(element) + # move element and tail over + c_source_doc = element._c_node.doc + c_next = element._c_node.next + tree.xmlAddPrevSibling(c_node, element._c_node) + _moveTail(c_next, element._c_node) + + # integrate element into new document + moveNodeToDocument(parent._doc, c_source_doc, element._c_node) + + # stop at the end of the slice + if slicelength > 0: + slicelength = slicelength - 1 + for i in range(step): + c_node = next_element(c_node) + if c_node is NULL: + break + else: + # everything inserted + return 0 + + # append the remaining elements at the respective end + if left_to_right: + for element in elements: + assert element is not None, u"Node must not be None" + _assertValidNode(element) + _appendChild(parent, element) + else: + for element in elements: + assert element is not None, u"Node must not be None" + _assertValidNode(element) + _prependChild(parent, element) + + return 0 + +cdef int _appendChild(_Element parent, _Element child) except -1: + u"""Append a new child to a parent element. + """ + c_node = child._c_node + c_source_doc = c_node.doc + # prevent cycles + c_parent = parent._c_node + while c_parent: + if c_parent is c_node: + raise ValueError("cannot append parent to itself") + c_parent = c_parent.parent + # store possible text node + c_next = c_node.next + # move node itself + tree.xmlUnlinkNode(c_node) + tree.xmlAddChild(parent._c_node, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. + moveNodeToDocument(parent._doc, c_source_doc, c_node) + return 0 + +cdef int _prependChild(_Element parent, _Element child) except -1: + u"""Prepend a new child to a parent element. + """ + c_node = child._c_node + c_source_doc = c_node.doc + # prevent cycles + c_parent = parent._c_node + while c_parent: + if c_parent is c_node: + raise ValueError("cannot append parent to itself") + c_parent = c_parent.parent + # store possible text node + c_next = c_node.next + # move node itself + c_child = _findChildForwards(parent._c_node, 0) + if c_child is NULL: + tree.xmlUnlinkNode(c_node) + tree.xmlAddChild(parent._c_node, c_node) + else: + tree.xmlAddPrevSibling(c_child, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. + moveNodeToDocument(parent._doc, c_source_doc, c_node) + return 0 + +cdef int _appendSibling(_Element element, _Element sibling) except -1: + u"""Add a new sibling behind an element. + """ + c_node = sibling._c_node + if element._c_node is c_node: + return 0 # nothing to do + c_source_doc = c_node.doc + # store possible text node + c_next = c_node.next + # move node itself + tree.xmlAddNextSibling(element._c_node, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. + moveNodeToDocument(element._doc, c_source_doc, c_node) + return 0 + +cdef int _prependSibling(_Element element, _Element sibling) except -1: + u"""Add a new sibling before an element. + """ + c_node = sibling._c_node + if element._c_node is c_node: + return 0 # nothing to do + c_source_doc = c_node.doc + # store possible text node + c_next = c_node.next + # move node itself + tree.xmlAddPrevSibling(element._c_node, c_node) + _moveTail(c_next, c_node) + # uh oh, elements may be pointing to different doc when + # parent element has moved; change them too.. + moveNodeToDocument(element._doc, c_source_doc, c_node) + return 0 + +cdef inline int isutf8(const_xmlChar* s): + cdef xmlChar c = s[0] + while c != c'\0': + if c & 0x80: + return 1 + s += 1 + c = s[0] + return 0 + +cdef int check_string_utf8(bytes pystring): + u"""Check if a string looks like valid UTF-8 XML content. Returns 0 + for ASCII, 1 for UTF-8 and -1 in the case of errors, such as NULL + bytes or ASCII control characters. + """ + cdef const_xmlChar* s = _xcstr(pystring) + cdef const_xmlChar* c_end = s + len(pystring) + cdef bint is_non_ascii = 0 + while s < c_end: + if s[0] & 0x80: + # skip over multi byte sequences + while s < c_end and s[0] & 0x80: + s += 1 + is_non_ascii = 1 + if s < c_end and not tree.xmlIsChar_ch(s[0]): + return -1 # invalid! + s += 1 + return is_non_ascii + +cdef inline object funicodeOrNone(const_xmlChar* s): + return funicode(s) if s is not NULL else None + +cdef inline object funicodeOrEmpty(const_xmlChar* s): + return funicode(s) if s is not NULL else '' + +cdef object funicode(const_xmlChar* s): + cdef Py_ssize_t slen + cdef const_xmlChar* spos + cdef bint is_non_ascii + if python.LXML_UNICODE_STRINGS: + return s.decode('UTF-8') + spos = s + is_non_ascii = 0 + while spos[0] != c'\0': + if spos[0] & 0x80: + is_non_ascii = 1 + break + spos += 1 + slen = spos - s + if spos[0] != c'\0': + slen += tree.xmlStrlen(spos) + if is_non_ascii: + return s[:slen].decode('UTF-8') + return s[:slen] + +cdef bytes _utf8(object s): + """Test if a string is valid user input and encode it to UTF-8. + Reject all bytes/unicode input that contains non-XML characters. + Reject all bytes input that contains non-ASCII characters. + """ + cdef int invalid + cdef bytes utf8_string + if not python.IS_PYTHON3 and type(s) is bytes: + utf8_string = s + invalid = check_string_utf8(utf8_string) + elif isinstance(s, unicode): + utf8_string = (s).encode('utf8') + invalid = check_string_utf8(utf8_string) == -1 # non-XML? + elif isinstance(s, (bytes, bytearray)): + utf8_string = bytes(s) + invalid = check_string_utf8(utf8_string) + else: + raise TypeError("Argument must be bytes or unicode, got '%.200s'" % type(s).__name__) + if invalid: + raise ValueError( + "All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters") + return utf8_string + +cdef bytes _utf8orNone(object s): + return _utf8(s) if s is not None else None + +cdef bint _isFilePath(const_xmlChar* c_path): + u"simple heuristic to see if a path is a filename" + cdef xmlChar c + # test if it looks like an absolute Unix path or a Windows network path + if c_path[0] == c'/': + return 1 + + # test if it looks like an absolute Windows path or URL + if (c_path[0] >= c'a' and c_path[0] <= c'z') or \ + (c_path[0] >= c'A' and c_path[0] <= c'Z'): + c_path += 1 + if c_path[0] == c':' and c_path[1] in b'\0\\': + return 1 # C: or C:\... + + # test if it looks like a URL with scheme:// + while (c_path[0] >= c'a' and c_path[0] <= c'z') or \ + (c_path[0] >= c'A' and c_path[0] <= c'Z'): + c_path += 1 + if c_path[0] == c':' and c_path[1] == c'/' and c_path[2] == c'/': + return 0 + + # assume it's a relative path + return 1 + +cdef object _encodeFilename(object filename): + u"""Make sure a filename is 8-bit encoded (or None). + """ + if filename is None: + return None + elif isinstance(filename, bytes): + return filename + elif isinstance(filename, unicode): + filename8 = (filename).encode('utf8') + if _isFilePath(filename8): + try: + return python.PyUnicode_AsEncodedString( + filename, _C_FILENAME_ENCODING, NULL) + except UnicodeEncodeError: + pass + return filename8 + else: + raise TypeError("Argument must be string or unicode.") + +cdef object _decodeFilename(const_xmlChar* c_path): + u"""Make the filename a unicode string if we are in Py3. + """ + return _decodeFilenameWithLength(c_path, tree.xmlStrlen(c_path)) + +cdef object _decodeFilenameWithLength(const_xmlChar* c_path, size_t c_len): + u"""Make the filename a unicode string if we are in Py3. + """ + if _isFilePath(c_path): + try: + return python.PyUnicode_Decode( + c_path, c_len, _C_FILENAME_ENCODING, NULL) + except UnicodeDecodeError: + pass + try: + return (c_path)[:c_len].decode('UTF-8') + except UnicodeDecodeError: + # this is a stupid fallback, but it might still work... + return (c_path)[:c_len].decode('latin-1', 'replace') + +cdef object _encodeFilenameUTF8(object filename): + u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and + UTF-8 as source encoding. + """ + cdef char* c_filename + if filename is None: + return None + elif isinstance(filename, bytes): + if not check_string_utf8(filename): + # plain ASCII! + return filename + c_filename = _cstr(filename) + try: + # try to decode with default encoding + filename = python.PyUnicode_Decode( + c_filename, len(filename), + _C_FILENAME_ENCODING, NULL) + except UnicodeDecodeError as decode_exc: + try: + # try if it's proper UTF-8 + (filename).decode('utf8') + return filename + except UnicodeDecodeError: + raise decode_exc # otherwise re-raise original exception + if isinstance(filename, unicode): + return (filename).encode('utf8') + else: + raise TypeError("Argument must be string or unicode.") + +cdef tuple _getNsTag(tag): + u"""Given a tag, find namespace URI and tag name. + Return None for NS uri if no namespace URI provided. + """ + return __getNsTag(tag, 0) + +cdef tuple _getNsTagWithEmptyNs(tag): + u"""Given a tag, find namespace URI and tag name. Return None for NS uri + if no namespace URI provided, or the empty string if namespace + part is '{}'. + """ + return __getNsTag(tag, 1) + +cdef tuple __getNsTag(tag, bint empty_ns): + cdef char* c_tag + cdef char* c_ns_end + cdef Py_ssize_t taglen + cdef Py_ssize_t nslen + cdef bytes ns = None + # _isString() is much faster than isinstance() + if not _isString(tag) and isinstance(tag, QName): + tag = (tag).text + tag = _utf8(tag) + c_tag = _cstr(tag) + if c_tag[0] == c'{': + c_tag += 1 + c_ns_end = cstring_h.strchr(c_tag, c'}') + if c_ns_end is NULL: + raise ValueError, u"Invalid tag name" + nslen = c_ns_end - c_tag + taglen = python.PyBytes_GET_SIZE(tag) - nslen - 2 + if taglen == 0: + raise ValueError, u"Empty tag name" + if nslen > 0: + ns = c_tag[:nslen] + elif empty_ns: + ns = b'' + tag = c_ns_end[1:taglen+1] + elif python.PyBytes_GET_SIZE(tag) == 0: + raise ValueError, u"Empty tag name" + return ns, tag + +cdef inline int _pyXmlNameIsValid(name_utf8): + return _xmlNameIsValid(_xcstr(name_utf8)) + +cdef inline int _pyHtmlNameIsValid(name_utf8): + return _htmlNameIsValid(_xcstr(name_utf8)) + +cdef inline int _xmlNameIsValid(const_xmlChar* c_name): + return tree.xmlValidateNCName(c_name, 0) == 0 + +cdef int _htmlNameIsValid(const_xmlChar* c_name): + if c_name is NULL or c_name[0] == c'\0': + return 0 + while c_name[0] != c'\0': + if c_name[0] in b'&<>/"\'\t\n\x0B\x0C\r ': + return 0 + c_name += 1 + return 1 + +cdef bint _characterReferenceIsValid(const_xmlChar* c_name): + cdef bint is_hex + if c_name[0] == c'x': + c_name += 1 + is_hex = 1 + else: + is_hex = 0 + if c_name[0] == c'\0': + return 0 + while c_name[0] != c'\0': + if c_name[0] < c'0' or c_name[0] > c'9': + if not is_hex: + return 0 + if not (c'a' <= c_name[0] <= c'f'): + if not (c'A' <= c_name[0] <= c'F'): + return 0 + c_name += 1 + return 1 + +cdef int _tagValidOrRaise(tag_utf) except -1: + if not _pyXmlNameIsValid(tag_utf): + raise ValueError(u"Invalid tag name %r" % + (tag_utf).decode('utf8')) + return 0 + +cdef int _htmlTagValidOrRaise(tag_utf) except -1: + if not _pyHtmlNameIsValid(tag_utf): + raise ValueError(u"Invalid HTML tag name %r" % + (tag_utf).decode('utf8')) + return 0 + +cdef int _attributeValidOrRaise(name_utf) except -1: + if not _pyXmlNameIsValid(name_utf): + raise ValueError(u"Invalid attribute name %r" % + (name_utf).decode('utf8')) + return 0 + +cdef int _prefixValidOrRaise(tag_utf) except -1: + if not _pyXmlNameIsValid(tag_utf): + raise ValueError(u"Invalid namespace prefix %r" % + (tag_utf).decode('utf8')) + return 0 + +cdef int _uriValidOrRaise(uri_utf) except -1: + cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf)) + if c_uri is NULL: + raise ValueError(u"Invalid namespace URI %r" % + (uri_utf).decode('utf8')) + uri.xmlFreeURI(c_uri) + return 0 + +cdef inline object _namespacedName(xmlNode* c_node): + return _namespacedNameFromNsName(_getNs(c_node), c_node.name) + +cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name): + if href is NULL: + return funicode(name) + elif python.LXML_UNICODE_STRINGS and not python.IS_PYPY: + return python.PyUnicode_FromFormat("{%s}%s", href, name) + else: + s = python.PyBytes_FromFormat("{%s}%s", href, name) + if python.LXML_UNICODE_STRINGS or isutf8(_xcstr(s)): + return (s).decode('utf8') + else: + return s + +cdef _getFilenameForFile(source): + u"""Given a Python File or Gzip object, give filename back. + + Returns None if not a file object. + """ + # urllib2 provides a geturl() method + try: + return source.geturl() + except: + pass + # file instances have a name attribute + try: + filename = source.name + if _isString(filename): + return os_path_abspath(filename) + except: + pass + # gzip file instances have a filename attribute (before Py3k) + try: + filename = source.filename + if _isString(filename): + return os_path_abspath(filename) + except: + pass + # can't determine filename + return None diff --git a/lib/lxml/builder.py b/lib/lxml/builder.py new file mode 100644 index 00000000..ad61a80e --- /dev/null +++ b/lib/lxml/builder.py @@ -0,0 +1,238 @@ +# +# Element generator factory by Fredrik Lundh. +# +# Source: +# http://online.effbot.org/2006_11_01_archive.htm#et-builder +# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py +# +# -------------------------------------------------------------------- +# The ElementTree toolkit is +# +# Copyright (c) 1999-2004 by Fredrik Lundh +# +# By obtaining, using, and/or copying this software and/or its +# associated documentation, you agree that you have read, understood, +# and will comply with the following terms and conditions: +# +# Permission to use, copy, modify, and distribute this software and +# its associated documentation for any purpose and without fee is +# hereby granted, provided that the above copyright notice appears in +# all copies, and that both that copyright notice and this permission +# notice appear in supporting documentation, and that the name of +# Secret Labs AB or the author not be used in advertising or publicity +# pertaining to distribution of the software without specific, written +# prior permission. +# +# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD +# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- +# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR +# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY +# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. +# -------------------------------------------------------------------- + +""" +The ``E`` Element factory for generating XML documents. +""" + +import lxml.etree as ET + +try: + from functools import partial +except ImportError: + # fake it for pre-2.5 releases + def partial(func, tag): + return lambda *args, **kwargs: func(tag, *args, **kwargs) + +try: + callable +except NameError: + # Python 3 + def callable(f): + return hasattr(f, '__call__') + +try: + basestring +except NameError: + basestring = str + +try: + unicode +except NameError: + unicode = str + + +class ElementMaker(object): + """Element generator factory. + + Unlike the ordinary Element factory, the E factory allows you to pass in + more than just a tag and some optional attributes; you can also pass in + text and other elements. The text is added as either text or tail + attributes, and elements are inserted at the right spot. Some small + examples:: + + >>> from lxml import etree as ET + >>> from lxml.builder import E + + >>> ET.tostring(E("tag")) + '' + >>> ET.tostring(E("tag", "text")) + 'text' + >>> ET.tostring(E("tag", "text", key="value")) + 'text' + >>> ET.tostring(E("tag", E("subtag", "text"), "tail")) + 'texttail' + + For simple tags, the factory also allows you to write ``E.tag(...)`` instead + of ``E('tag', ...)``:: + + >>> ET.tostring(E.tag()) + '' + >>> ET.tostring(E.tag("text")) + 'text' + >>> ET.tostring(E.tag(E.subtag("text"), "tail")) + 'texttail' + + Here's a somewhat larger example; this shows how to generate HTML + documents, using a mix of prepared factory functions for inline elements, + nested ``E.tag`` calls, and embedded XHTML fragments:: + + # some common inline elements + A = E.a + I = E.i + B = E.b + + def CLASS(v): + # helper function, 'class' is a reserved word + return {'class': v} + + page = ( + E.html( + E.head( + E.title("This is a sample document") + ), + E.body( + E.h1("Hello!", CLASS("title")), + E.p("This is a paragraph with ", B("bold"), " text in it!"), + E.p("This is another paragraph, with a ", + A("link", href="http://www.python.org"), "."), + E.p("Here are some reservered characters: ."), + ET.XML("

And finally, here is an embedded XHTML fragment.

"), + ) + ) + ) + + print ET.tostring(page) + + Here's a prettyprinted version of the output from the above script:: + + + + This is a sample document + + +

Hello!

+

This is a paragraph with bold text in it!

+

This is another paragraph, with link.

+

Here are some reservered characters: <spam&egg>.

+

And finally, here is an embedded XHTML fragment.

+ + + + For namespace support, you can pass a namespace map (``nsmap``) + and/or a specific target ``namespace`` to the ElementMaker class:: + + >>> E = ElementMaker(namespace="http://my.ns/") + >>> print(ET.tostring( E.test )) + + + >>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'}) + >>> print(ET.tostring( E.test )) + + """ + + def __init__(self, typemap=None, + namespace=None, nsmap=None, makeelement=None): + if namespace is not None: + self._namespace = '{' + namespace + '}' + else: + self._namespace = None + + if nsmap: + self._nsmap = dict(nsmap) + else: + self._nsmap = None + + if makeelement is not None: + assert callable(makeelement) + self._makeelement = makeelement + else: + self._makeelement = ET.Element + + # initialize type map for this element factory + + if typemap: + typemap = typemap.copy() + else: + typemap = {} + + def add_text(elem, item): + try: + elem[-1].tail = (elem[-1].tail or "") + item + except IndexError: + elem.text = (elem.text or "") + item + if str not in typemap: + typemap[str] = add_text + if unicode not in typemap: + typemap[unicode] = add_text + + def add_dict(elem, item): + attrib = elem.attrib + for k, v in item.items(): + if isinstance(v, basestring): + attrib[k] = v + else: + attrib[k] = typemap[type(v)](None, v) + if dict not in typemap: + typemap[dict] = add_dict + + self._typemap = typemap + + def __call__(self, tag, *children, **attrib): + get = self._typemap.get + + if self._namespace is not None and tag[0] != '{': + tag = self._namespace + tag + elem = self._makeelement(tag, nsmap=self._nsmap) + if attrib: + get(dict)(elem, attrib) + + for item in children: + if callable(item): + item = item() + t = get(type(item)) + if t is None: + if ET.iselement(item): + elem.append(item) + continue + for basetype in type(item).__mro__: + # See if the typemap knows of any of this type's bases. + t = get(basetype) + if t is not None: + break + else: + raise TypeError("bad argument type: %s(%r)" % + (type(item).__name__, item)) + v = t(elem, item) + if v: + get(type(v))(elem, v) + + return elem + + def __getattr__(self, tag): + return partial(self, tag) + +# create factory object +E = ElementMaker() diff --git a/lib/lxml/classlookup.pxi b/lib/lxml/classlookup.pxi new file mode 100644 index 00000000..82740a51 --- /dev/null +++ b/lib/lxml/classlookup.pxi @@ -0,0 +1,565 @@ +# Configurable Element class lookup + +################################################################################ +# Custom Element classes + +cdef public class ElementBase(_Element) [ type LxmlElementBaseType, + object LxmlElementBase ]: + u"""ElementBase(*children, attrib=None, nsmap=None, **_extra) + + The public Element class. All custom Element classes must inherit + from this one. To create an Element, use the `Element()` factory. + + BIG FAT WARNING: Subclasses *must not* override __init__ or + __new__ as it is absolutely undefined when these objects will be + created or destroyed. All persistent state of Elements must be + stored in the underlying XML. If you really need to initialize + the object after creation, you can implement an ``_init(self)`` + method that will be called directly after object creation. + + Subclasses of this class can be instantiated to create a new + Element. By default, the tag name will be the class name and the + namespace will be empty. You can modify this with the following + class attributes: + + * TAG - the tag name, possibly containing a namespace in Clark + notation + + * NAMESPACE - the default namespace URI, unless provided as part + of the TAG attribute. + + * HTML - flag if the class is an HTML tag, as opposed to an XML + tag. This only applies to un-namespaced tags and defaults to + false (i.e. XML). + + * PARSER - the parser that provides the configuration for the + newly created document. Providing an HTML parser here will + default to creating an HTML element. + + In user code, the latter three are commonly inherited in class + hierarchies that implement a common namespace. + """ + def __init__(self, *children, attrib=None, nsmap=None, **_extra): + u"""ElementBase(*children, attrib=None, nsmap=None, **_extra) + """ + cdef bint is_html = 0 + cdef _BaseParser parser + cdef _Element last_child + # don't use normal attribute access as it might be overridden + _getattr = object.__getattribute__ + try: + namespace = _utf8(_getattr(self, 'NAMESPACE')) + except AttributeError: + namespace = None + try: + ns, tag = _getNsTag(_getattr(self, 'TAG')) + if ns is not None: + namespace = ns + except AttributeError: + tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__')) + if b'.' in tag: + tag = tag.split(b'.')[-1] + try: + parser = _getattr(self, 'PARSER') + except AttributeError: + parser = None + for child in children: + if isinstance(child, _Element): + parser = (<_Element>child)._doc._parser + break + if isinstance(parser, HTMLParser): + is_html = 1 + if namespace is None: + try: + is_html = _getattr(self, 'HTML') + except AttributeError: + pass + _initNewElement(self, is_html, tag, namespace, parser, + attrib, nsmap, _extra) + last_child = None + for child in children: + if _isString(child): + if last_child is None: + _setNodeText(self._c_node, + (_collectText(self._c_node.children) or '') + child) + else: + _setTailText(last_child._c_node, + (_collectText(last_child._c_node.next) or '') + child) + elif isinstance(child, _Element): + last_child = child + _appendChild(self, last_child) + elif isinstance(child, type) and issubclass(child, ElementBase): + last_child = child() + _appendChild(self, last_child) + else: + raise TypeError, "Invalid child type: %r" % type(child) + +cdef class CommentBase(_Comment): + u"""All custom Comment classes must inherit from this one. + + To create an XML Comment instance, use the ``Comment()`` factory. + + Subclasses *must not* override __init__ or __new__ as it is + absolutely undefined when these objects will be created or + destroyed. All persistent state of Comments must be stored in the + underlying XML. If you really need to initialize the object after + creation, you can implement an ``_init(self)`` method that will be + called after object creation. + """ + def __init__(self, text): + # copied from Comment() factory + cdef _Document doc + cdef xmlDoc* c_doc + if text is None: + text = b'' + else: + text = _utf8(text) + c_doc = _newXMLDoc() + doc = _documentFactory(c_doc, None) + self._c_node = _createComment(c_doc, _xcstr(text)) + if self._c_node is NULL: + raise MemoryError() + tree.xmlAddChild(c_doc, self._c_node) + _registerProxy(self, doc, self._c_node) + self._init() + +cdef class PIBase(_ProcessingInstruction): + u"""All custom Processing Instruction classes must inherit from this one. + + To create an XML ProcessingInstruction instance, use the ``PI()`` + factory. + + Subclasses *must not* override __init__ or __new__ as it is + absolutely undefined when these objects will be created or + destroyed. All persistent state of PIs must be stored in the + underlying XML. If you really need to initialize the object after + creation, you can implement an ``_init(self)`` method that will be + called after object creation. + """ + def __init__(self, target, text=None): + # copied from PI() factory + cdef _Document doc + cdef xmlDoc* c_doc + target = _utf8(target) + if text is None: + text = b'' + else: + text = _utf8(text) + c_doc = _newXMLDoc() + doc = _documentFactory(c_doc, None) + self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text)) + if self._c_node is NULL: + raise MemoryError() + tree.xmlAddChild(c_doc, self._c_node) + _registerProxy(self, doc, self._c_node) + self._init() + +cdef class EntityBase(_Entity): + u"""All custom Entity classes must inherit from this one. + + To create an XML Entity instance, use the ``Entity()`` factory. + + Subclasses *must not* override __init__ or __new__ as it is + absolutely undefined when these objects will be created or + destroyed. All persistent state of Entities must be stored in the + underlying XML. If you really need to initialize the object after + creation, you can implement an ``_init(self)`` method that will be + called after object creation. + """ + def __init__(self, name): + cdef _Document doc + cdef xmlDoc* c_doc + name_utf = _utf8(name) + c_name = _xcstr(name_utf) + if c_name[0] == c'#': + if not _characterReferenceIsValid(c_name + 1): + raise ValueError, u"Invalid character reference: '%s'" % name + elif not _xmlNameIsValid(c_name): + raise ValueError, u"Invalid entity reference: '%s'" % name + c_doc = _newXMLDoc() + doc = _documentFactory(c_doc, None) + self._c_node = _createEntity(c_doc, c_name) + if self._c_node is NULL: + raise MemoryError() + tree.xmlAddChild(c_doc, self._c_node) + _registerProxy(self, doc, self._c_node) + self._init() + + +cdef int _validateNodeClass(xmlNode* c_node, cls) except -1: + if c_node.type == tree.XML_ELEMENT_NODE: + expected = ElementBase + elif c_node.type == tree.XML_COMMENT_NODE: + expected = CommentBase + elif c_node.type == tree.XML_ENTITY_REF_NODE: + expected = EntityBase + elif c_node.type == tree.XML_PI_NODE: + expected = PIBase + else: + assert 0, u"Unknown node type: %s" % c_node.type + + if not (isinstance(cls, type) and issubclass(cls, expected)): + raise TypeError( + "result of class lookup must be subclass of %s, got %s" + % (type(expected), type(cls))) + return 0 + + +################################################################################ +# Element class lookup + +ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*) + +# class to store element class lookup functions +cdef public class ElementClassLookup [ type LxmlElementClassLookupType, + object LxmlElementClassLookup ]: + u"""ElementClassLookup(self) + Superclass of Element class lookups. + """ + cdef _element_class_lookup_function _lookup_function + def __cinit__(self): + self._lookup_function = NULL # use default lookup + +cdef public class FallbackElementClassLookup(ElementClassLookup) \ + [ type LxmlFallbackElementClassLookupType, + object LxmlFallbackElementClassLookup ]: + u"""FallbackElementClassLookup(self, fallback=None) + + Superclass of Element class lookups with additional fallback. + """ + cdef readonly ElementClassLookup fallback + cdef _element_class_lookup_function _fallback_function + def __cinit__(self): + # fall back to default lookup + self._fallback_function = _lookupDefaultElementClass + + def __init__(self, ElementClassLookup fallback=None): + if fallback is not None: + self._setFallback(fallback) + else: + self._fallback_function = _lookupDefaultElementClass + + cdef void _setFallback(self, ElementClassLookup lookup): + u"""Sets the fallback scheme for this lookup method. + """ + self.fallback = lookup + self._fallback_function = lookup._lookup_function + if self._fallback_function is NULL: + self._fallback_function = _lookupDefaultElementClass + + def set_fallback(self, ElementClassLookup lookup not None): + u"""set_fallback(self, lookup) + + Sets the fallback scheme for this lookup method. + """ + self._setFallback(lookup) + +cdef inline object _callLookupFallback(FallbackElementClassLookup lookup, + _Document doc, xmlNode* c_node): + return lookup._fallback_function(lookup.fallback, doc, c_node) + + +################################################################################ +# default lookup scheme + +cdef class ElementDefaultClassLookup(ElementClassLookup): + u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None) + Element class lookup scheme that always returns the default Element + class. + + The keyword arguments ``element``, ``comment``, ``pi`` and ``entity`` + accept the respective Element classes. + """ + cdef readonly object element_class + cdef readonly object comment_class + cdef readonly object pi_class + cdef readonly object entity_class + def __cinit__(self): + self._lookup_function = _lookupDefaultElementClass + + def __init__(self, element=None, comment=None, pi=None, entity=None): + if element is None: + self.element_class = _Element + elif issubclass(element, ElementBase): + self.element_class = element + else: + raise TypeError, u"element class must be subclass of ElementBase" + + if comment is None: + self.comment_class = _Comment + elif issubclass(comment, CommentBase): + self.comment_class = comment + else: + raise TypeError, u"comment class must be subclass of CommentBase" + + if entity is None: + self.entity_class = _Entity + elif issubclass(entity, EntityBase): + self.entity_class = entity + else: + raise TypeError, u"Entity class must be subclass of EntityBase" + + if pi is None: + self.pi_class = None # special case, see below + elif issubclass(pi, PIBase): + self.pi_class = pi + else: + raise TypeError, u"PI class must be subclass of PIBase" + +cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node): + u"Trivial class lookup function that always returns the default class." + if c_node.type == tree.XML_ELEMENT_NODE: + if state is not None: + return (state).element_class + else: + return _Element + elif c_node.type == tree.XML_COMMENT_NODE: + if state is not None: + return (state).comment_class + else: + return _Comment + elif c_node.type == tree.XML_ENTITY_REF_NODE: + if state is not None: + return (state).entity_class + else: + return _Entity + elif c_node.type == tree.XML_PI_NODE: + if state is None or (state).pi_class is None: + # special case XSLT-PI + if c_node.name is not NULL and c_node.content is not NULL: + if tree.xmlStrcmp(c_node.name, "xml-stylesheet") == 0: + if tree.xmlStrstr(c_node.content, "text/xsl") is not NULL or \ + tree.xmlStrstr(c_node.content, "text/xml") is not NULL: + return _XSLTProcessingInstruction + return _ProcessingInstruction + else: + return (state).pi_class + else: + assert 0, u"Unknown node type: %s" % c_node.type + + +################################################################################ +# attribute based lookup scheme + +cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup): + u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None) + Checks an attribute of an Element and looks up the value in a + class dictionary. + + Arguments: + - attribute name - '{ns}name' style string + - class mapping - Python dict mapping attribute values to Element classes + - fallback - optional fallback lookup mechanism + + A None key in the class mapping will be checked if the attribute is + missing. + """ + cdef object _class_mapping + cdef tuple _pytag + cdef const_xmlChar* _c_ns + cdef const_xmlChar* _c_name + def __cinit__(self): + self._lookup_function = _attribute_class_lookup + + def __init__(self, attribute_name, class_mapping, + ElementClassLookup fallback=None): + self._pytag = _getNsTag(attribute_name) + ns, name = self._pytag + if ns is None: + self._c_ns = NULL + else: + self._c_ns = _xcstr(ns) + self._c_name = _xcstr(name) + self._class_mapping = dict(class_mapping) + + FallbackElementClassLookup.__init__(self, fallback) + +cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node): + cdef AttributeBasedElementClassLookup lookup + cdef python.PyObject* dict_result + + lookup = state + if c_node.type == tree.XML_ELEMENT_NODE: + value = _attributeValueFromNsName( + c_node, lookup._c_ns, lookup._c_name) + dict_result = python.PyDict_GetItem(lookup._class_mapping, value) + if dict_result is not NULL: + cls = dict_result + _validateNodeClass(c_node, cls) + return cls + return _callLookupFallback(lookup, doc, c_node) + + +################################################################################ +# per-parser lookup scheme + +cdef class ParserBasedElementClassLookup(FallbackElementClassLookup): + u"""ParserBasedElementClassLookup(self, fallback=None) + Element class lookup based on the XML parser. + """ + def __cinit__(self): + self._lookup_function = _parser_class_lookup + +cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node): + if doc._parser._class_lookup is not None: + return doc._parser._class_lookup._lookup_function( + doc._parser._class_lookup, doc, c_node) + return _callLookupFallback(state, doc, c_node) + + +################################################################################ +# custom class lookup based on node type, namespace, name + +cdef class CustomElementClassLookup(FallbackElementClassLookup): + u"""CustomElementClassLookup(self, fallback=None) + Element class lookup based on a subclass method. + + You can inherit from this class and override the method:: + + lookup(self, type, doc, namespace, name) + + to lookup the element class for a node. Arguments of the method: + * type: one of 'element', 'comment', 'PI', 'entity' + * doc: document that the node is in + * namespace: namespace URI of the node (or None for comments/PIs/entities) + * name: name of the element/entity, None for comments, target for PIs + + If you return None from this method, the fallback will be called. + """ + def __cinit__(self): + self._lookup_function = _custom_class_lookup + + def lookup(self, type, doc, namespace, name): + u"lookup(self, type, doc, namespace, name)" + return None + +cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node): + cdef CustomElementClassLookup lookup + + lookup = state + + if c_node.type == tree.XML_ELEMENT_NODE: + element_type = u"element" + elif c_node.type == tree.XML_COMMENT_NODE: + element_type = u"comment" + elif c_node.type == tree.XML_PI_NODE: + element_type = u"PI" + elif c_node.type == tree.XML_ENTITY_REF_NODE: + element_type = u"entity" + else: + element_type = u"element" + if c_node.name is NULL: + name = None + else: + name = funicode(c_node.name) + c_str = tree._getNs(c_node) + ns = funicode(c_str) if c_str is not NULL else None + + cls = lookup.lookup(element_type, doc, ns, name) + if cls is not None: + _validateNodeClass(c_node, cls) + return cls + return _callLookupFallback(lookup, doc, c_node) + + +################################################################################ +# read-only tree based class lookup + +cdef class PythonElementClassLookup(FallbackElementClassLookup): + u"""PythonElementClassLookup(self, fallback=None) + Element class lookup based on a subclass method. + + This class lookup scheme allows access to the entire XML tree in + read-only mode. To use it, re-implement the ``lookup(self, doc, + root)`` method in a subclass:: + + from lxml import etree, pyclasslookup + + class MyElementClass(etree.ElementBase): + honkey = True + + class MyLookup(pyclasslookup.PythonElementClassLookup): + def lookup(self, doc, root): + if root.tag == "sometag": + return MyElementClass + else: + for child in root: + if child.tag == "someothertag": + return MyElementClass + # delegate to default + return None + + If you return None from this method, the fallback will be called. + + The first argument is the opaque document instance that contains + the Element. The second argument is a lightweight Element proxy + implementation that is only valid during the lookup. Do not try + to keep a reference to it. Once the lookup is done, the proxy + will be invalid. + + Also, you cannot wrap such a read-only Element in an ElementTree, + and you must take care not to keep a reference to them outside of + the `lookup()` method. + + Note that the API of the Element objects is not complete. It is + purely read-only and does not support all features of the normal + `lxml.etree` API (such as XPath, extended slicing or some + iteration methods). + + See http://codespeak.net/lxml/element_classes.html + """ + def __cinit__(self): + self._lookup_function = _python_class_lookup + + def lookup(self, doc, element): + u"""lookup(self, doc, element) + + Override this method to implement your own lookup scheme. + """ + return None + +cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node): + cdef PythonElementClassLookup lookup + cdef _ReadOnlyElementProxy proxy + lookup = state + + proxy = _newReadOnlyProxy(None, c_node) + cls = lookup.lookup(doc, proxy) + _freeReadOnlyProxies(proxy) + + if cls is not None: + _validateNodeClass(c_node, cls) + return cls + return _callLookupFallback(lookup, doc, c_node) + +################################################################################ +# Global setup + +cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS +cdef object ELEMENT_CLASS_LOOKUP_STATE + +cdef void _setElementClassLookupFunction( + _element_class_lookup_function function, object state): + global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE + if function is NULL: + state = DEFAULT_ELEMENT_CLASS_LOOKUP + function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function + + ELEMENT_CLASS_LOOKUP_STATE = state + LOOKUP_ELEMENT_CLASS = function + +def set_element_class_lookup(ElementClassLookup lookup = None): + u"""set_element_class_lookup(lookup = None) + + Set the global default element class lookup method. + """ + if lookup is None or lookup._lookup_function is NULL: + _setElementClassLookupFunction(NULL, None) + else: + _setElementClassLookupFunction(lookup._lookup_function, lookup) + +# default setup: parser delegation +cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP +DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup() + +set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP) diff --git a/lib/lxml/cleanup.pxi b/lib/lxml/cleanup.pxi new file mode 100644 index 00000000..5388ffb6 --- /dev/null +++ b/lib/lxml/cleanup.pxi @@ -0,0 +1,210 @@ +# functions for tree cleanup and removing elements from subtrees + +def cleanup_namespaces(tree_or_element): + u"""cleanup_namespaces(tree_or_element) + + Remove all namespace declarations from a subtree that are not used + by any of the elements or attributes in that tree. + """ + cdef _Element element + element = _rootNodeOrRaise(tree_or_element) + _removeUnusedNamespaceDeclarations(element._c_node) + +def strip_attributes(tree_or_element, *attribute_names): + u"""strip_attributes(tree_or_element, *attribute_names) + + Delete all attributes with the provided attribute names from an + Element (or ElementTree) and its descendants. + + Attribute names can contain wildcards as in `_Element.iter`. + + Example usage:: + + strip_attributes(root_element, + 'simpleattr', + '{http://some/ns}attrname', + '{http://other/ns}*') + """ + cdef _MultiTagMatcher matcher + cdef _Element element + + element = _rootNodeOrRaise(tree_or_element) + if not attribute_names: + return + + matcher = _MultiTagMatcher(attribute_names) + matcher.cacheTags(element._doc) + if matcher.rejectsAllAttributes(): + return + _strip_attributes(element._c_node, matcher) + +cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher): + cdef xmlAttr* c_attr + cdef xmlAttr* c_next_attr + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + if c_node.type == tree.XML_ELEMENT_NODE: + c_attr = c_node.properties + while c_attr is not NULL: + c_next_attr = c_attr.next + if matcher.matchesAttribute(c_attr): + tree.xmlRemoveProp(c_attr) + c_attr = c_next_attr + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + +def strip_elements(tree_or_element, *tag_names, bint with_tail=True): + u"""strip_elements(tree_or_element, *tag_names, with_tail=True) + + Delete all elements with the provided tag names from a tree or + subtree. This will remove the elements and their entire subtree, + including all their attributes, text content and descendants. It + will also remove the tail text of the element unless you + explicitly set the ``with_tail`` keyword argument option to False. + + Tag names can contain wildcards as in `_Element.iter`. + + Note that this will not delete the element (or ElementTree root + element) that you passed even if it matches. It will only treat + its descendants. If you want to include the root element, check + its tag name directly before even calling this function. + + Example usage:: + + strip_elements(some_element, + 'simpletagname', # non-namespaced tag + '{http://some/ns}tagname', # namespaced tag + '{http://some/other/ns}*' # any tag from a namespace + lxml.etree.Comment # comments + ) + """ + cdef _MultiTagMatcher matcher + cdef _Element element + cdef _Document doc + cdef list ns_tags + cdef qname* c_ns_tags + cdef Py_ssize_t c_tag_count + cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0 + + doc = _documentOrRaise(tree_or_element) + element = _rootNodeOrRaise(tree_or_element) + if not tag_names: + return + + matcher = _MultiTagMatcher(tag_names) + matcher.cacheTags(doc) + if matcher.rejectsAll(): + return + + if isinstance(tree_or_element, _ElementTree): + # include PIs and comments next to the root node + if matcher.matchesType(tree.XML_COMMENT_NODE): + _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail) + if matcher.matchesType(tree.XML_PI_NODE): + _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail) + _strip_elements(doc, element._c_node, matcher, with_tail) + +cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher, + bint with_tail): + cdef xmlNode* c_child + cdef xmlNode* c_next + + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + if c_node.type == tree.XML_ELEMENT_NODE: + # we run through the children here to prevent any problems + # with the tree iteration which would occur if we unlinked the + # c_node itself + c_child = _findChildForwards(c_node, 0) + while c_child is not NULL: + c_next = _nextElement(c_child) + if matcher.matches(c_child): + if c_child.type == tree.XML_ELEMENT_NODE: + if not with_tail: + tree.xmlUnlinkNode(c_child) + _removeNode(doc, c_child) + else: + if with_tail: + _removeText(c_child.next) + tree.xmlUnlinkNode(c_child) + attemptDeallocation(c_child) + c_child = c_next + tree.END_FOR_EACH_ELEMENT_FROM(c_node) + + +def strip_tags(tree_or_element, *tag_names): + u"""strip_tags(tree_or_element, *tag_names) + + Delete all elements with the provided tag names from a tree or + subtree. This will remove the elements and their attributes, but + *not* their text/tail content or descendants. Instead, it will + merge the text content and children of the element into its + parent. + + Tag names can contain wildcards as in `_Element.iter`. + + Note that this will not delete the element (or ElementTree root + element) that you passed even if it matches. It will only treat + its descendants. + + Example usage:: + + strip_tags(some_element, + 'simpletagname', # non-namespaced tag + '{http://some/ns}tagname', # namespaced tag + '{http://some/other/ns}*' # any tag from a namespace + Comment # comments (including their text!) + ) + """ + cdef _MultiTagMatcher matcher + cdef _Element element + cdef _Document doc + cdef list ns_tags + cdef bint strip_comments = 0, strip_pis = 0, strip_entities = 0 + cdef char** c_ns_tags + cdef Py_ssize_t c_tag_count + + doc = _documentOrRaise(tree_or_element) + element = _rootNodeOrRaise(tree_or_element) + if not tag_names: + return + + matcher = _MultiTagMatcher(tag_names) + matcher.cacheTags(doc) + if matcher.rejectsAll(): + return + + if isinstance(tree_or_element, _ElementTree): + # include PIs and comments next to the root node + if matcher.matchesType(tree.XML_COMMENT_NODE): + _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0) + if matcher.matchesType(tree.XML_PI_NODE): + _removeSiblings(element._c_node, tree.XML_PI_NODE, 0) + _strip_tags(doc, element._c_node, matcher) + +cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher): + cdef xmlNode* c_child + cdef xmlNode* c_next + cdef Py_ssize_t i + + tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1) + if c_node.type == tree.XML_ELEMENT_NODE: + # we run through the children here to prevent any problems + # with the tree iteration which would occur if we unlinked the + # c_node itself + c_child = _findChildForwards(c_node, 0) + while c_child is not NULL: + if not matcher.matches(c_child): + c_child = _nextElement(c_child) + continue + if c_child.type == tree.XML_ELEMENT_NODE: + c_next = _findChildForwards(c_child, 0) or _nextElement(c_child) + _replaceNodeByChildren(doc, c_child) + if not attemptDeallocation(c_child): + if c_child.nsDef is not NULL: + # make namespaces absolute + moveNodeToDocument(doc, doc._c_doc, c_child) + c_child = c_next + else: + c_next = _nextElement(c_child) + tree.xmlUnlinkNode(c_child) + attemptDeallocation(c_child) + c_child = c_next + tree.END_FOR_EACH_ELEMENT_FROM(c_node) diff --git a/lib/lxml/cssselect.py b/lib/lxml/cssselect.py new file mode 100644 index 00000000..e8effaa2 --- /dev/null +++ b/lib/lxml/cssselect.py @@ -0,0 +1,103 @@ +"""CSS Selectors based on XPath. + +This module supports selecting XML/HTML tags based on CSS selectors. +See the `CSSSelector` class for details. + +This is a thin wrapper around cssselect 0.7 or later. +""" + +import sys +from lxml import etree + +## Work-around the lack of absolute import in Python 2.4 +#from __future__ import absolute_import +#from cssselect import ... +try: + external_cssselect = __import__('cssselect') +except ImportError: + raise ImportError('cssselect seems not to be installed. ' + 'See http://packages.python.org/cssselect/') + +SelectorSyntaxError = external_cssselect.SelectorSyntaxError +ExpressionError = external_cssselect.ExpressionError +SelectorError = external_cssselect.SelectorError + + +__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError', + 'CSSSelector'] + + +class LxmlTranslator(external_cssselect.GenericTranslator): + """ + A custom CSS selector to XPath translator with lxml-specific extensions. + """ + def xpath_contains_function(self, xpath, function): + # Defined there, removed in later drafts: + # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors + if function.argument_types() not in (['STRING'], ['IDENT']): + raise ExpressionError( + "Expected a single string or ident for :contains(), got %r" + % function.arguments) + value = function.arguments[0].value + return xpath.add_condition( + 'contains(__lxml_internal_css:lower-case(string(.)), %s)' + % self.xpath_literal(value.lower())) + + +class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator): + """ + lxml extensions + HTML support. + """ + + +def _make_lower_case(context, s): + return s.lower() + +ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/') +ns.prefix = '__lxml_internal_css' +ns['lower-case'] = _make_lower_case + + +class CSSSelector(etree.XPath): + """A CSS selector. + + Usage:: + + >>> from lxml import etree, cssselect + >>> select = cssselect.CSSSelector("a tag > child") + + >>> root = etree.XML("TEXT") + >>> [ el.tag for el in select(root) ] + ['child'] + + To use CSS namespaces, you need to pass a prefix-to-namespace + mapping as ``namespaces`` keyword argument:: + + >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' + >>> select_ns = cssselect.CSSSelector('root > rdf|Description', + ... namespaces={'rdf': rdfns}) + + >>> rdf = etree.XML(( + ... '' + ... 'blah' + ... '') % rdfns) + >>> [(el.tag, el.text) for el in select_ns(rdf)] + [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')] + + """ + def __init__(self, css, namespaces=None, translator='xml'): + if translator == 'xml': + translator = LxmlTranslator() + elif translator == 'html': + translator = LxmlHTMLTranslator() + elif translator == 'xhtml': + translator = LxmlHTMLTranslator(xhtml=True) + path = translator.css_to_xpath(css) + etree.XPath.__init__(self, path, namespaces=namespaces) + self.css = css + + def __repr__(self): + return '<%s %s for %r>' % ( + self.__class__.__name__, + hex(abs(id(self)))[2:], + self.css) diff --git a/lib/lxml/cvarargs.pxd b/lib/lxml/cvarargs.pxd new file mode 100644 index 00000000..824c1f0c --- /dev/null +++ b/lib/lxml/cvarargs.pxd @@ -0,0 +1,8 @@ +cdef extern from "stdarg.h": + ctypedef void *va_list + void va_start(va_list ap, void *last) nogil + void va_end(va_list ap) nogil + +cdef extern from "etree_defs.h": + cdef int va_int(va_list ap) nogil + cdef char *va_charptr(va_list ap) nogil diff --git a/lib/lxml/debug.pxi b/lib/lxml/debug.pxi new file mode 100644 index 00000000..47b8497b --- /dev/null +++ b/lib/lxml/debug.pxi @@ -0,0 +1,91 @@ + +@cython.final +@cython.internal +cdef class _MemDebug: + """Debugging support for the memory allocation in libxml2. + """ + def bytes_used(self): + """bytes_used(self) + + Returns the total amount of memory (in bytes) currently used by libxml2. + Note that libxml2 constrains this value to a C int, which limits + the accuracy on 64 bit systems. + """ + return tree.xmlMemUsed() + + def blocks_used(self): + """blocks_used(self) + + Returns the total number of memory blocks currently allocated by libxml2. + Note that libxml2 constrains this value to a C int, which limits + the accuracy on 64 bit systems. + """ + return tree.xmlMemBlocks() + + def dict_size(self): + """dict_size(self) + + Returns the current size of the global name dictionary used by libxml2 + for the current thread. Each thread has its own dictionary. + """ + c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL) + if c_dict is NULL: + raise MemoryError() + return tree.xmlDictSize(c_dict) + + def dump(self, output_file=None, byte_count=None): + """dump(self, output_file=None, byte_count=None) + + Dumps the current memory blocks allocated by libxml2 to a file. + + The optional parameter 'output_file' specifies the file path. It defaults + to the file ".memorylist" in the current directory. + + The optional parameter 'byte_count' limits the number of bytes in the dump. + Note that this parameter is ignored when lxml is compiled against a libxml2 + version before 2.7.0. + """ + cdef Py_ssize_t c_count + if output_file is None: + output_file = b'.memorylist' + elif isinstance(output_file, unicode): + output_file.encode(sys.getfilesystemencoding()) + + f = stdio.fopen(output_file, "w") + if f is NULL: + raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding())) + try: + if byte_count is None: + tree.xmlMemDisplay(f) + else: + c_count = byte_count + tree.xmlMemDisplayLast(f, c_count) + finally: + stdio.fclose(f) + + def show(self, output_file=None, block_count=None): + """show(self, output_file=None, block_count=None) + + Dumps the current memory blocks allocated by libxml2 to a file. + The output file format is suitable for line diffing. + + The optional parameter 'output_file' specifies the file path. It defaults + to the file ".memorydump" in the current directory. + + The optional parameter 'block_count' limits the number of blocks + in the dump. + """ + if output_file is None: + output_file = b'.memorydump' + elif isinstance(output_file, unicode): + output_file.encode(sys.getfilesystemencoding()) + + f = stdio.fopen(output_file, "w") + if f is NULL: + raise IOError("Failed to create file %s" % output_file.decode(sys.getfilesystemencoding())) + try: + tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks()) + finally: + stdio.fclose(f) + +memory_debugger = _MemDebug() diff --git a/lib/lxml/docloader.pxi b/lib/lxml/docloader.pxi new file mode 100644 index 00000000..dd515b7f --- /dev/null +++ b/lib/lxml/docloader.pxi @@ -0,0 +1,175 @@ +# Custom resolver API + +ctypedef enum _InputDocumentDataType: + PARSER_DATA_INVALID + PARSER_DATA_EMPTY + PARSER_DATA_STRING + PARSER_DATA_FILENAME + PARSER_DATA_FILE + +@cython.final +@cython.internal +cdef class _InputDocument: + cdef _InputDocumentDataType _type + cdef bytes _data_bytes + cdef object _filename + cdef object _file + cdef bint _close_file + + def __cinit__(self): + self._type = PARSER_DATA_INVALID + + +cdef class Resolver: + u"This is the base class of all resolvers." + def resolve(self, system_url, public_id, context): + u"""resolve(self, system_url, public_id, context) + + Override this method to resolve an external source by + ``system_url`` and ``public_id``. The third argument is an + opaque context object. + + Return the result of one of the ``resolve_*()`` methods. + """ + return None + + def resolve_empty(self, context): + u"""resolve_empty(self, context) + + Return an empty input document. + + Pass context as parameter. + """ + cdef _InputDocument doc_ref + doc_ref = _InputDocument() + doc_ref._type = PARSER_DATA_EMPTY + return doc_ref + + def resolve_string(self, string, context, *, base_url=None): + u"""resolve_string(self, string, context, base_url=None) + + Return a parsable string as input document. + + Pass data string and context as parameters. You can pass the + source URL or filename through the ``base_url`` keyword + argument. + """ + cdef _InputDocument doc_ref + if isinstance(string, unicode): + string = (string).encode('utf8') + elif not isinstance(string, bytes): + raise TypeError, "argument must be a byte string or unicode string" + doc_ref = _InputDocument() + doc_ref._type = PARSER_DATA_STRING + doc_ref._data_bytes = string + if base_url is not None: + doc_ref._filename = _encodeFilename(base_url) + return doc_ref + + def resolve_filename(self, filename, context): + u"""resolve_filename(self, filename, context) + + Return the name of a parsable file as input document. + + Pass filename and context as parameters. You can also pass a + URL with an HTTP, FTP or file target. + """ + cdef _InputDocument doc_ref + doc_ref = _InputDocument() + doc_ref._type = PARSER_DATA_FILENAME + doc_ref._filename = _encodeFilename(filename) + return doc_ref + + def resolve_file(self, f, context, *, base_url=None, bint close=True): + u"""resolve_file(self, f, context, base_url=None, close=True) + + Return an open file-like object as input document. + + Pass open file and context as parameters. You can pass the + base URL or filename of the file through the ``base_url`` + keyword argument. If the ``close`` flag is True (the + default), the file will be closed after reading. + + Note that using ``.resolve_filename()`` is more efficient, + especially in threaded environments. + """ + cdef _InputDocument doc_ref + try: + f.read + except AttributeError: + raise TypeError, u"Argument is not a file-like object" + doc_ref = _InputDocument() + doc_ref._type = PARSER_DATA_FILE + if base_url is not None: + doc_ref._filename = _encodeFilename(base_url) + else: + doc_ref._filename = _getFilenameForFile(f) + doc_ref._close_file = close + doc_ref._file = f + return doc_ref + +@cython.final +@cython.internal +cdef class _ResolverRegistry: + cdef object _resolvers + cdef Resolver _default_resolver + def __cinit__(self, Resolver default_resolver=None): + self._resolvers = set() + self._default_resolver = default_resolver + + def add(self, Resolver resolver not None): + u"""add(self, resolver) + + Register a resolver. + + For each requested entity, the 'resolve' method of the resolver will + be called and the result will be passed to the parser. If this method + returns None, the request will be delegated to other resolvers or the + default resolver. The resolvers will be tested in an arbitrary order + until the first match is found. + """ + self._resolvers.add(resolver) + + def remove(self, resolver): + u"remove(self, resolver)" + self._resolvers.discard(resolver) + + cdef _ResolverRegistry _copy(self): + cdef _ResolverRegistry registry + registry = _ResolverRegistry(self._default_resolver) + registry._resolvers = self._resolvers.copy() + return registry + + def copy(self): + u"copy(self)" + return self._copy() + + def resolve(self, system_url, public_id, context): + u"resolve(self, system_url, public_id, context)" + for resolver in self._resolvers: + result = resolver.resolve(system_url, public_id, context) + if result is not None: + return result + if self._default_resolver is None: + return None + return self._default_resolver.resolve(system_url, public_id, context) + + def __repr__(self): + return repr(self._resolvers) + +@cython.internal +cdef class _ResolverContext(_ExceptionContext): + cdef _ResolverRegistry _resolvers + cdef _TempStore _storage + + cdef void clear(self): + _ExceptionContext.clear(self) + self._storage.clear() + +cdef _initResolverContext(_ResolverContext context, + _ResolverRegistry resolvers): + if resolvers is None: + context._resolvers = _ResolverRegistry() + else: + context._resolvers = resolvers + context._storage = _TempStore() diff --git a/lib/lxml/doctestcompare.py b/lib/lxml/doctestcompare.py new file mode 100644 index 00000000..3cd5ce48 --- /dev/null +++ b/lib/lxml/doctestcompare.py @@ -0,0 +1,505 @@ +""" +lxml-based doctest output comparison. + +Note: normally, you should just import the `lxml.usedoctest` and +`lxml.html.usedoctest` modules from within a doctest, instead of this +one:: + + >>> import lxml.usedoctest # for XML output + + >>> import lxml.html.usedoctest # for HTML output + +To use this module directly, you must call ``lxmldoctest.install()``, +which will cause doctest to use this in all subsequent calls. + +This changes the way output is checked and comparisons are made for +XML or HTML-like content. + +XML or HTML content is noticed because the example starts with ``<`` +(it's HTML if it starts with ```` or include an ``any`` +attribute in the tag. An ``any`` tag matches any tag, while the +attribute matches any and all attributes. + +When a match fails, the reformatted example and gotten text is +displayed (indented), and a rough diff-like output is given. Anything +marked with ``-`` is in the output but wasn't supposed to be, and +similarly ``+`` means its in the example but wasn't in the output. + +You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP`` +""" + +from lxml import etree +import sys +import re +import doctest +import cgi + +__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker', + 'LHTMLOutputChecker', 'install', 'temp_install'] + +try: + _basestring = basestring +except NameError: + _basestring = (str, bytes) + +_IS_PYTHON_3 = sys.version_info[0] >= 3 + +PARSE_HTML = doctest.register_optionflag('PARSE_HTML') +PARSE_XML = doctest.register_optionflag('PARSE_XML') +NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP') + +OutputChecker = doctest.OutputChecker + +def strip(v): + if v is None: + return None + else: + return v.strip() + +def norm_whitespace(v): + return _norm_whitespace_re.sub(' ', v) + +_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True) + +def html_fromstring(html): + return etree.fromstring(html, _html_parser) + +# We use this to distinguish repr()s from elements: +_repr_re = re.compile(r'^<[^>]+ (at|object) ') +_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+') + +class LXMLOutputChecker(OutputChecker): + + empty_tags = ( + 'param', 'img', 'area', 'br', 'basefont', 'input', + 'base', 'meta', 'link', 'col') + + def get_default_parser(self): + return etree.XML + + def check_output(self, want, got, optionflags): + alt_self = getattr(self, '_temp_override_self', None) + if alt_self is not None: + super_method = self._temp_call_super_check_output + self = alt_self + else: + super_method = OutputChecker.check_output + parser = self.get_parser(want, got, optionflags) + if not parser: + return super_method( + self, want, got, optionflags) + try: + want_doc = parser(want) + except etree.XMLSyntaxError: + return False + try: + got_doc = parser(got) + except etree.XMLSyntaxError: + return False + return self.compare_docs(want_doc, got_doc) + + def get_parser(self, want, got, optionflags): + parser = None + if NOPARSE_MARKUP & optionflags: + return None + if PARSE_HTML & optionflags: + parser = html_fromstring + elif PARSE_XML & optionflags: + parser = etree.XML + elif (want.strip().lower().startswith('' % el.tag + return '<%s %s>' % (el.tag, ' '.join(attrs)) + + def format_end_tag(self, el): + if isinstance(el, etree.CommentBase): + # FIXME: probably PIs should be handled specially too? + return '-->' + return '' % el.tag + + def collect_diff(self, want, got, html, indent): + parts = [] + if not len(want) and not len(got): + parts.append(' '*indent) + parts.append(self.collect_diff_tag(want, got)) + if not self.html_empty_tag(got, html): + parts.append(self.collect_diff_text(want.text, got.text)) + parts.append(self.collect_diff_end_tag(want, got)) + parts.append(self.collect_diff_text(want.tail, got.tail)) + parts.append('\n') + return ''.join(parts) + parts.append(' '*indent) + parts.append(self.collect_diff_tag(want, got)) + parts.append('\n') + if strip(want.text) or strip(got.text): + parts.append(' '*indent) + parts.append(self.collect_diff_text(want.text, got.text)) + parts.append('\n') + want_children = list(want) + got_children = list(got) + while want_children or got_children: + if not want_children: + parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-')) + continue + if not got_children: + parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+')) + continue + parts.append(self.collect_diff( + want_children.pop(0), got_children.pop(0), html, indent+2)) + parts.append(' '*indent) + parts.append(self.collect_diff_end_tag(want, got)) + parts.append('\n') + if strip(want.tail) or strip(got.tail): + parts.append(' '*indent) + parts.append(self.collect_diff_text(want.tail, got.tail)) + parts.append('\n') + return ''.join(parts) + + def collect_diff_tag(self, want, got): + if not self.tag_compare(want.tag, got.tag): + tag = '%s (got: %s)' % (want.tag, got.tag) + else: + tag = got.tag + attrs = [] + any = want.tag == 'any' or 'any' in want.attrib + for name, value in sorted(got.attrib.items()): + if name not in want.attrib and not any: + attrs.append('-%s="%s"' % (name, self.format_text(value, False))) + else: + if name in want.attrib: + text = self.collect_diff_text(want.attrib[name], value, False) + else: + text = self.format_text(value, False) + attrs.append('%s="%s"' % (name, text)) + if not any: + for name, value in sorted(want.attrib.items()): + if name in got.attrib: + continue + attrs.append('+%s="%s"' % (name, self.format_text(value, False))) + if attrs: + tag = '<%s %s>' % (tag, ' '.join(attrs)) + else: + tag = '<%s>' % tag + return tag + + def collect_diff_end_tag(self, want, got): + if want.tag != got.tag: + tag = '%s (got: %s)' % (want.tag, got.tag) + else: + tag = got.tag + return '' % tag + + def collect_diff_text(self, want, got, strip=True): + if self.text_compare(want, got, strip): + if not got: + return '' + return self.format_text(got, strip) + text = '%s (got: %s)' % (want, got) + return self.format_text(text, strip) + +class LHTMLOutputChecker(LXMLOutputChecker): + def get_default_parser(self): + return html_fromstring + +def install(html=False): + """ + Install doctestcompare for all future doctests. + + If html is true, then by default the HTML parser will be used; + otherwise the XML parser is used. + """ + if html: + doctest.OutputChecker = LHTMLOutputChecker + else: + doctest.OutputChecker = LXMLOutputChecker + +def temp_install(html=False, del_module=None): + """ + Use this *inside* a doctest to enable this checker for this + doctest only. + + If html is true, then by default the HTML parser will be used; + otherwise the XML parser is used. + """ + if html: + Checker = LHTMLOutputChecker + else: + Checker = LXMLOutputChecker + frame = _find_doctest_frame() + dt_self = frame.f_locals['self'] + checker = Checker() + old_checker = dt_self._checker + dt_self._checker = checker + # The unfortunate thing is that there is a local variable 'check' + # in the function that runs the doctests, that is a bound method + # into the output checker. We have to update that. We can't + # modify the frame, so we have to modify the object in place. The + # only way to do this is to actually change the func_code + # attribute of the method. We change it, and then wait for + # __record_outcome to be run, which signals the end of the __run + # method, at which point we restore the previous check_output + # implementation. + if _IS_PYTHON_3: + check_func = frame.f_locals['check'].__func__ + checker_check_func = checker.check_output.__func__ + else: + check_func = frame.f_locals['check'].im_func + checker_check_func = checker.check_output.im_func + # Because we can't patch up func_globals, this is the only global + # in check_output that we care about: + doctest.etree = etree + _RestoreChecker(dt_self, old_checker, checker, + check_func, checker_check_func, + del_module) + +class _RestoreChecker(object): + def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func, + del_module): + self.dt_self = dt_self + self.checker = old_checker + self.checker._temp_call_super_check_output = self.call_super + self.checker._temp_override_self = new_checker + self.check_func = check_func + self.clone_func = clone_func + self.del_module = del_module + self.install_clone() + self.install_dt_self() + def install_clone(self): + if _IS_PYTHON_3: + self.func_code = self.check_func.__code__ + self.func_globals = self.check_func.__globals__ + self.check_func.__code__ = self.clone_func.__code__ + else: + self.func_code = self.check_func.func_code + self.func_globals = self.check_func.func_globals + self.check_func.func_code = self.clone_func.func_code + def uninstall_clone(self): + if _IS_PYTHON_3: + self.check_func.__code__ = self.func_code + else: + self.check_func.func_code = self.func_code + def install_dt_self(self): + self.prev_func = self.dt_self._DocTestRunner__record_outcome + self.dt_self._DocTestRunner__record_outcome = self + def uninstall_dt_self(self): + self.dt_self._DocTestRunner__record_outcome = self.prev_func + def uninstall_module(self): + if self.del_module: + import sys + del sys.modules[self.del_module] + if '.' in self.del_module: + package, module = self.del_module.rsplit('.', 1) + package_mod = sys.modules[package] + delattr(package_mod, module) + def __call__(self, *args, **kw): + self.uninstall_clone() + self.uninstall_dt_self() + del self.checker._temp_override_self + del self.checker._temp_call_super_check_output + result = self.prev_func(*args, **kw) + self.uninstall_module() + return result + def call_super(self, *args, **kw): + self.uninstall_clone() + try: + return self.check_func(*args, **kw) + finally: + self.install_clone() + +def _find_doctest_frame(): + import sys + frame = sys._getframe(1) + while frame: + l = frame.f_locals + if 'BOOM' in l: + # Sign of doctest + return frame + frame = frame.f_back + raise LookupError( + "Could not find doctest (only use this function *inside* a doctest)") + +__test__ = { + 'basic': ''' + >>> temp_install() + >>> print """stuff""" + ... + >>> print """""" + + + + >>> print """blahblahblah""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS + ...foo /> + '''} + +if __name__ == '__main__': + import doctest + doctest.testmod() + + diff --git a/lib/lxml/dtd.pxi b/lib/lxml/dtd.pxi new file mode 100644 index 00000000..d1913b42 --- /dev/null +++ b/lib/lxml/dtd.pxi @@ -0,0 +1,468 @@ +# support for DTD validation +from lxml.includes cimport dtdvalid + +class DTDError(LxmlError): + u"""Base class for DTD errors. + """ + pass + +class DTDParseError(DTDError): + u"""Error while parsing a DTD. + """ + pass + +class DTDValidateError(DTDError): + u"""Error while validating an XML document with a DTD. + """ + pass + +cdef inline int _assertValidDTDNode(node, void *c_node) except -1: + assert c_node is not NULL, u"invalid DTD proxy at %s" % id(node) + + +@cython.final +@cython.internal +@cython.freelist(8) +cdef class _DTDElementContentDecl: + cdef DTD _dtd + cdef tree.xmlElementContent* _c_node + + def __repr__(self): + return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.type, self.occur, id(self)) + + property name: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.name) if self._c_node.name is not NULL else None + + property type: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef int type = self._c_node.type + if type == tree.XML_ELEMENT_CONTENT_PCDATA: + return "pcdata" + elif type == tree.XML_ELEMENT_CONTENT_ELEMENT: + return "element" + elif type == tree.XML_ELEMENT_CONTENT_SEQ: + return "seq" + elif type == tree.XML_ELEMENT_CONTENT_OR: + return "or" + else: + return None + + property occur: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef int occur = self._c_node.ocur + if occur == tree.XML_ELEMENT_CONTENT_ONCE: + return "once" + elif occur == tree.XML_ELEMENT_CONTENT_OPT: + return "opt" + elif occur == tree.XML_ELEMENT_CONTENT_MULT: + return "mult" + elif occur == tree.XML_ELEMENT_CONTENT_PLUS: + return "plus" + else: + return None + + property left: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + c1 = self._c_node.c1 + if c1: + node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl) + node._dtd = self._dtd + node._c_node = c1 + return node + else: + return None + + property right: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + c2 = self._c_node.c2 + if c2: + node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl) + node._dtd = self._dtd + node._c_node = c2 + return node + else: + return None + + +@cython.final +@cython.internal +@cython.freelist(8) +cdef class _DTDAttributeDecl: + cdef DTD _dtd + cdef tree.xmlAttribute* _c_node + + def __repr__(self): + return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self)) + + property name: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.name) if self._c_node.name is not NULL else None + + property elemname: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.elem) if self._c_node.elem is not NULL else None + + property prefix: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None + + property type: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef int type = self._c_node.atype + if type == tree.XML_ATTRIBUTE_CDATA: + return "cdata" + elif type == tree.XML_ATTRIBUTE_ID: + return "id" + elif type == tree.XML_ATTRIBUTE_IDREF: + return "idref" + elif type == tree.XML_ATTRIBUTE_IDREFS: + return "idrefs" + elif type == tree.XML_ATTRIBUTE_ENTITY: + return "entity" + elif type == tree.XML_ATTRIBUTE_ENTITIES: + return "entities" + elif type == tree.XML_ATTRIBUTE_NMTOKEN: + return "nmtoken" + elif type == tree.XML_ATTRIBUTE_NMTOKENS: + return "nmtokens" + elif type == tree.XML_ATTRIBUTE_ENUMERATION: + return "enumeration" + elif type == tree.XML_ATTRIBUTE_NOTATION: + return "notation" + else: + return None + + property default: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef int default = self._c_node.def_ + if default == tree.XML_ATTRIBUTE_NONE: + return "none" + elif default == tree.XML_ATTRIBUTE_REQUIRED: + return "required" + elif default == tree.XML_ATTRIBUTE_IMPLIED: + return "implied" + elif default == tree.XML_ATTRIBUTE_FIXED: + return "fixed" + else: + return None + + property default_value: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.defaultValue) if self._c_node.defaultValue is not NULL else None + + def itervalues(self): + _assertValidDTDNode(self, self._c_node) + cdef tree.xmlEnumeration *c_node = self._c_node.tree + while c_node is not NULL: + yield funicode(c_node.name) + c_node = c_node.next + + def values(self): + return list(self.itervalues()) + + +@cython.final +@cython.internal +@cython.freelist(8) +cdef class _DTDElementDecl: + cdef DTD _dtd + cdef tree.xmlElement* _c_node + + def __repr__(self): + return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self)) + + property name: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.name) if self._c_node.name is not NULL else None + + property prefix: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.prefix) if self._c_node.prefix is not NULL else None + + property type: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef int type = self._c_node.etype + if type == tree.XML_ELEMENT_TYPE_UNDEFINED: + return "undefined" + elif type == tree.XML_ELEMENT_TYPE_EMPTY: + return "empty" + elif type == tree.XML_ELEMENT_TYPE_ANY: + return "any" + elif type == tree.XML_ELEMENT_TYPE_MIXED: + return "mixed" + elif type == tree.XML_ELEMENT_TYPE_ELEMENT: + return "element" + else: + return None + + property content: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + cdef tree.xmlElementContent *content = self._c_node.content + if content: + node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl) + node._dtd = self._dtd + node._c_node = content + return node + else: + return None + + def iterattributes(self): + _assertValidDTDNode(self, self._c_node) + cdef tree.xmlAttribute *c_node = self._c_node.attributes + while c_node: + node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl) + node._dtd = self._dtd + node._c_node = c_node + yield node + c_node = c_node.nexth + + def attributes(self): + return list(self.iterattributes()) + + +@cython.final +@cython.internal +@cython.freelist(8) +cdef class _DTDEntityDecl: + cdef DTD _dtd + cdef tree.xmlEntity* _c_node + def __repr__(self): + return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self)) + + property name: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.name) if self._c_node.name is not NULL else None + + property orig: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.orig) if self._c_node.orig is not NULL else None + + property content: + def __get__(self): + _assertValidDTDNode(self, self._c_node) + return funicode(self._c_node.content) if self._c_node.content is not NULL else None + + +################################################################################ +# DTD + +cdef class DTD(_Validator): + u"""DTD(self, file=None, external_id=None) + A DTD validator. + + Can load from filesystem directly given a filename or file-like object. + Alternatively, pass the keyword parameter ``external_id`` to load from a + catalog. + """ + cdef tree.xmlDtd* _c_dtd + def __init__(self, file=None, *, external_id=None): + _Validator.__init__(self) + if file is not None: + if _isString(file): + file = _encodeFilename(file) + with self._error_log: + self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file)) + elif hasattr(file, 'read'): + self._c_dtd = _parseDtdFromFilelike(file) + else: + raise DTDParseError, u"file must be a filename or file-like object" + elif external_id is not None: + with self._error_log: + self._c_dtd = xmlparser.xmlParseDTD(external_id, NULL) + else: + raise DTDParseError, u"either filename or external ID required" + + if self._c_dtd is NULL: + raise DTDParseError( + self._error_log._buildExceptionMessage(u"error parsing DTD"), + self._error_log) + + property name: + def __get__(self): + if self._c_dtd is NULL: + return None + return funicodeOrNone(self._c_dtd.name) + + property external_id: + def __get__(self): + if self._c_dtd is NULL: + return None + return funicodeOrNone(self._c_dtd.ExternalID) + + property system_url: + def __get__(self): + if self._c_dtd is NULL: + return None + return funicodeOrNone(self._c_dtd.SystemID) + + def iterelements(self): + cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL + while c_node is not NULL: + if c_node.type == tree.XML_ELEMENT_DECL: + node = _DTDElementDecl() + node._dtd = self + node._c_node = c_node + yield node + c_node = c_node.next + + def elements(self): + return list(self.iterelements()) + + def iterentities(self): + cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL + while c_node is not NULL: + if c_node.type == tree.XML_ENTITY_DECL: + node = _DTDEntityDecl() + node._dtd = self + node._c_node = c_node + yield node + c_node = c_node.next + + def entities(self): + return list(self.iterentities()) + + def __dealloc__(self): + tree.xmlFreeDtd(self._c_dtd) + + def __call__(self, etree): + u"""__call__(self, etree) + + Validate doc using the DTD. + + Returns true if the document is valid, false if not. + """ + cdef _Document doc + cdef _Element root_node + cdef xmlDoc* c_doc + cdef dtdvalid.xmlValidCtxt* valid_ctxt + cdef int ret = -1 + + assert self._c_dtd is not NULL, "DTD not initialised" + doc = _documentOrRaise(etree) + root_node = _rootNodeOrRaise(etree) + + valid_ctxt = dtdvalid.xmlNewValidCtxt() + if valid_ctxt is NULL: + raise DTDError(u"Failed to create validation context") + + # work around error reporting bug in libxml2 <= 2.9.1 (and later?) + # https://bugzilla.gnome.org/show_bug.cgi?id=724903 + valid_ctxt.error = _nullGenericErrorFunc + valid_ctxt.userData = NULL + + try: + with self._error_log: + c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node) + ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd) + _destroyFakeDoc(doc._c_doc, c_doc) + finally: + dtdvalid.xmlFreeValidCtxt(valid_ctxt) + + if ret == -1: + raise DTDValidateError(u"Internal error in DTD validation", + self._error_log) + return ret == 1 + + +cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL: + cdef _ExceptionContext exc_context + cdef _FileReaderContext dtd_parser + cdef _ErrorLog error_log + cdef tree.xmlDtd* c_dtd + exc_context = _ExceptionContext() + dtd_parser = _FileReaderContext(file, exc_context, None) + error_log = _ErrorLog() + + with error_log: + c_dtd = dtd_parser._readDtd() + + exc_context._raise_if_stored() + if c_dtd is NULL: + raise DTDParseError(u"error parsing DTD", error_log) + return c_dtd + +cdef DTD _dtdFactory(tree.xmlDtd* c_dtd): + # do not run through DTD.__init__()! + cdef DTD dtd + if c_dtd is NULL: + return None + dtd = DTD.__new__(DTD) + dtd._c_dtd = _copyDtd(c_dtd) + _Validator.__init__(dtd) + return dtd + + +cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL: + """ + Copy a DTD. libxml2 (currently) fails to set up the element->attributes + links when copying DTDs, so we have to rebuild them here. + """ + c_dtd = tree.xmlCopyDtd(c_orig_dtd) + if not c_dtd: + raise MemoryError + cdef tree.xmlNode* c_node = c_dtd.children + while c_node: + if c_node.type == tree.XML_ATTRIBUTE_DECL: + _linkDtdAttribute(c_dtd, c_node) + c_node = c_node.next + return c_dtd + + +cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr): + """ + Create the link to the DTD attribute declaration from the corresponding + element declaration. + """ + c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem) + if not c_elem: + # no such element? something is wrong with the DTD ... + return + c_pos = c_elem.attributes + if not c_pos: + c_elem.attributes = c_attr + c_attr.nexth = NULL + return + # libxml2 keeps namespace declarations first, and we need to make + # sure we don't re-insert attributes that are already there + if _isDtdNsDecl(c_attr): + if not _isDtdNsDecl(c_pos): + c_elem.attributes = c_attr + c_attr.nexth = c_pos + return + while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth): + c_pos = c_pos.nexth + else: + # append at end + while c_pos != c_attr and c_pos.nexth: + c_pos = c_pos.nexth + if c_pos == c_attr: + return + c_attr.nexth = c_pos.nexth + c_pos.nexth = c_attr + + +cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr): + if cstring_h.strcmp(c_attr.name, "xmlns") == 0: + return True + if (c_attr.prefix is not NULL and + cstring_h.strcmp(c_attr.prefix, "xmlns") == 0): + return True + return False diff --git a/lib/lxml/extensions.pxi b/lib/lxml/extensions.pxi new file mode 100644 index 00000000..531036ef --- /dev/null +++ b/lib/lxml/extensions.pxi @@ -0,0 +1,855 @@ +# support for extension functions in XPath and XSLT + +class XPathError(LxmlError): + u"""Base class of all XPath errors. + """ + pass + +class XPathEvalError(XPathError): + u"""Error during XPath evaluation. + """ + pass + +class XPathFunctionError(XPathEvalError): + u"""Internal error looking up an XPath extension function. + """ + pass + +class XPathResultError(XPathEvalError): + u"""Error handling an XPath result. + """ + pass + +# forward declarations + +ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf) +cdef class _ExsltRegExp + +################################################################################ +# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ... + +@cython.internal +cdef class _BaseContext: + cdef xpath.xmlXPathContext* _xpathCtxt + cdef _Document _doc + cdef dict _extensions + cdef list _namespaces + cdef list _global_namespaces + cdef dict _utf_refs + cdef dict _function_cache + cdef dict _eval_context_dict + cdef bint _build_smart_strings + # for exception handling and temporary reference keeping: + cdef _TempStore _temp_refs + cdef set _temp_documents + cdef _ExceptionContext _exc + cdef _ErrorLog _error_log + + def __cinit__(self): + self._xpathCtxt = NULL + + def __init__(self, namespaces, extensions, error_log, enable_regexp, + build_smart_strings): + cdef _ExsltRegExp _regexp + cdef dict new_extensions + cdef list ns + self._utf_refs = {} + self._global_namespaces = [] + self._function_cache = {} + self._eval_context_dict = None + self._error_log = error_log + + if extensions is not None: + # convert extensions to UTF-8 + if isinstance(extensions, dict): + extensions = (extensions,) + # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function} + new_extensions = {} + for extension in extensions: + for (ns_uri, name), function in extension.items(): + if name is None: + raise ValueError, u"extensions must have non empty names" + ns_utf = self._to_utf(ns_uri) + name_utf = self._to_utf(name) + new_extensions[(ns_utf, name_utf)] = function + extensions = new_extensions or None + + if namespaces is not None: + if isinstance(namespaces, dict): + namespaces = namespaces.items() + if namespaces: + ns = [] + for prefix, ns_uri in namespaces: + if prefix is None or not prefix: + raise TypeError, \ + u"empty namespace prefix is not supported in XPath" + if ns_uri is None or not ns_uri: + raise TypeError, \ + u"setting default namespace is not supported in XPath" + prefix_utf = self._to_utf(prefix) + ns_uri_utf = self._to_utf(ns_uri) + ns.append( (prefix_utf, ns_uri_utf) ) + namespaces = ns + else: + namespaces = None + + self._doc = None + self._exc = _ExceptionContext() + self._extensions = extensions + self._namespaces = namespaces + self._temp_refs = _TempStore() + self._temp_documents = set() + self._build_smart_strings = build_smart_strings + + if enable_regexp: + _regexp = _ExsltRegExp() + _regexp._register_in_context(self) + + cdef _BaseContext _copy(self): + cdef _BaseContext context + if self._namespaces is not None: + namespaces = self._namespaces[:] + else: + namespaces = None + context = self.__class__(namespaces, None, self._error_log, False, + self._build_smart_strings) + if self._extensions is not None: + context._extensions = self._extensions.copy() + return context + + cdef bytes _to_utf(self, s): + u"Convert to UTF-8 and keep a reference to the encoded string" + cdef python.PyObject* dict_result + if s is None: + return None + dict_result = python.PyDict_GetItem(self._utf_refs, s) + if dict_result is not NULL: + return dict_result + utf = _utf8(s) + self._utf_refs[s] = utf + if python.IS_PYPY: + # use C level refs, PyPy refs are not enough! + python.Py_INCREF(utf) + return utf + + cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt): + self._xpathCtxt = xpathCtxt + xpathCtxt.userData = self + xpathCtxt.error = _receiveXPathError + + @cython.final + cdef _register_context(self, _Document doc): + self._doc = doc + self._exc.clear() + + @cython.final + cdef _cleanup_context(self): + #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt) + #self.unregisterGlobalNamespaces() + if python.IS_PYPY: + # clean up double refs in PyPy (see "_to_utf()" method) + for ref in self._utf_refs.itervalues(): + python.Py_DECREF(ref) + self._utf_refs.clear() + self._eval_context_dict = None + self._doc = None + + @cython.final + cdef _release_context(self): + if self._xpathCtxt is not NULL: + self._xpathCtxt.userData = NULL + self._xpathCtxt = NULL + + # namespaces (internal UTF-8 methods with leading '_') + + cdef addNamespace(self, prefix, ns_uri): + cdef list namespaces + if prefix is None: + raise TypeError, u"empty prefix is not supported in XPath" + prefix_utf = self._to_utf(prefix) + ns_uri_utf = self._to_utf(ns_uri) + new_item = (prefix_utf, ns_uri_utf) + if self._namespaces is None: + self._namespaces = [new_item] + else: + namespaces = [] + for item in self._namespaces: + if item[0] == prefix_utf: + item = new_item + new_item = None + namespaces.append(item) + if new_item is not None: + namespaces.append(new_item) + self._namespaces = namespaces + if self._xpathCtxt is not NULL: + xpath.xmlXPathRegisterNs( + self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) + + cdef registerNamespace(self, prefix, ns_uri): + if prefix is None: + raise TypeError, u"empty prefix is not supported in XPath" + prefix_utf = self._to_utf(prefix) + ns_uri_utf = self._to_utf(ns_uri) + self._global_namespaces.append(prefix_utf) + xpath.xmlXPathRegisterNs(self._xpathCtxt, + _xcstr(prefix_utf), _xcstr(ns_uri_utf)) + + cdef registerLocalNamespaces(self): + if self._namespaces is None: + return + for prefix_utf, ns_uri_utf in self._namespaces: + xpath.xmlXPathRegisterNs( + self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) + + cdef registerGlobalNamespaces(self): + cdef list ns_prefixes = _find_all_extension_prefixes() + if python.PyList_GET_SIZE(ns_prefixes) > 0: + for prefix_utf, ns_uri_utf in ns_prefixes: + self._global_namespaces.append(prefix_utf) + xpath.xmlXPathRegisterNs( + self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf)) + + cdef unregisterGlobalNamespaces(self): + if python.PyList_GET_SIZE(self._global_namespaces) > 0: + for prefix_utf in self._global_namespaces: + xpath.xmlXPathRegisterNs(self._xpathCtxt, + _xcstr(prefix_utf), NULL) + del self._global_namespaces[:] + + cdef void _unregisterNamespace(self, prefix_utf): + xpath.xmlXPathRegisterNs(self._xpathCtxt, + _xcstr(prefix_utf), NULL) + + # extension functions + + cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1: + if self._extensions is None: + self._extensions = {} + self._extensions[(ns_utf, name_utf)] = function + return 0 + + cdef registerGlobalFunctions(self, void* ctxt, + _register_function reg_func): + cdef python.PyObject* dict_result + cdef dict d + for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems(): + dict_result = python.PyDict_GetItem( + self._function_cache, ns_utf) + if dict_result is not NULL: + d = dict_result + else: + d = {} + self._function_cache[ns_utf] = d + for name_utf, function in ns_functions.iteritems(): + d[name_utf] = function + reg_func(ctxt, name_utf, ns_utf) + + cdef registerLocalFunctions(self, void* ctxt, + _register_function reg_func): + cdef python.PyObject* dict_result + cdef dict d + if self._extensions is None: + return # done + last_ns = None + d = None + for (ns_utf, name_utf), function in self._extensions.iteritems(): + if ns_utf is not last_ns or d is None: + last_ns = ns_utf + dict_result = python.PyDict_GetItem( + self._function_cache, ns_utf) + if dict_result is not NULL: + d = dict_result + else: + d = {} + self._function_cache[ns_utf] = d + d[name_utf] = function + reg_func(ctxt, name_utf, ns_utf) + + cdef unregisterAllFunctions(self, void* ctxt, + _register_function unreg_func): + for ns_utf, functions in self._function_cache.iteritems(): + for name_utf in functions: + unreg_func(ctxt, name_utf, ns_utf) + + cdef unregisterGlobalFunctions(self, void* ctxt, + _register_function unreg_func): + for ns_utf, functions in self._function_cache.items(): + for name_utf in functions: + if self._extensions is None or \ + (ns_utf, name_utf) not in self._extensions: + unreg_func(ctxt, name_utf, ns_utf) + + @cython.final + cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name): + u"""Lookup an extension function in the cache and return it. + + Parameters: c_ns_uri may be NULL, c_name must not be NULL + """ + cdef python.PyObject* c_dict + cdef python.PyObject* dict_result + c_dict = python.PyDict_GetItem( + self._function_cache, None if c_ns_uri is NULL else c_ns_uri) + if c_dict is not NULL: + dict_result = python.PyDict_GetItem( + c_dict, c_name) + if dict_result is not NULL: + return dict_result + return None + + # Python access to the XPath context for extension functions + + property context_node: + def __get__(self): + cdef xmlNode* c_node + if self._xpathCtxt is NULL: + raise XPathError, \ + u"XPath context is only usable during the evaluation" + c_node = self._xpathCtxt.node + if c_node is NULL: + raise XPathError, u"no context node" + if c_node.doc != self._xpathCtxt.doc: + raise XPathError, \ + u"document-external context nodes are not supported" + if self._doc is None: + raise XPathError, u"document context is missing" + return _elementFactory(self._doc, c_node) + + property eval_context: + def __get__(self): + if self._eval_context_dict is None: + self._eval_context_dict = {} + return self._eval_context_dict + + # Python reference keeping during XPath function evaluation + + @cython.final + cdef _release_temp_refs(self): + u"Free temporarily referenced objects from this context." + self._temp_refs.clear() + self._temp_documents.clear() + + @cython.final + cdef _hold(self, obj): + u"""A way to temporarily hold references to nodes in the evaluator. + + This is needed because otherwise nodes created in XPath extension + functions would be reference counted too soon, during the XPath + evaluation. This is most important in the case of exceptions. + """ + cdef _Element element + if isinstance(obj, _Element): + self._temp_refs.add(obj) + self._temp_documents.add((<_Element>obj)._doc) + return + elif _isString(obj) or not python.PySequence_Check(obj): + return + for o in obj: + if isinstance(o, _Element): + #print "Holding element:", element._c_node + self._temp_refs.add(o) + #print "Holding document:", element._doc._c_doc + self._temp_documents.add((<_Element>o)._doc) + + @cython.final + cdef _Document _findDocumentForNode(self, xmlNode* c_node): + u"""If an XPath expression returns an element from a different + document than the current context document, we call this to + see if it was possibly created by an extension and is a known + document instance. + """ + cdef _Document doc + for doc in self._temp_documents: + if doc is not None and doc._c_doc is c_node.doc: + return doc + return None + + +# libxml2 keeps these error messages in a static array in its code +# and doesn't give us access to them ... + +cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = ( + b"Ok", + b"Number encoding", + b"Unfinished literal", + b"Start of literal", + b"Expected $ for variable reference", + b"Undefined variable", + b"Invalid predicate", + b"Invalid expression", + b"Missing closing curly brace", + b"Unregistered function", + b"Invalid operand", + b"Invalid type", + b"Invalid number of arguments", + b"Invalid context size", + b"Invalid context position", + b"Memory allocation error", + b"Syntax error", + b"Resource error", + b"Sub resource error", + b"Undefined namespace prefix", + b"Encoding error", + b"Char out of XML range", + b"Invalid or incomplete context", + b"Stack usage error", +) + +cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil: + cdef xmlerror.xmlError error + cdef int xpath_code + if c_error.message is not NULL: + error.message = c_error.message + else: + xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK + if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES): + error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code]) + else: + error.message = b"unknown error" + error.domain = c_error.domain + error.code = c_error.code + error.level = c_error.level + error.line = c_error.line + error.int2 = c_error.int1 # column + error.file = c_error.file + + (<_BaseContext>c_ctxt)._error_log._receive(&error) + +cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil: + if not __DEBUG: + return + if c_context is NULL: + _forwardError(NULL, error) + else: + _forwardXPathError(c_context, error) + + +def Extension(module, function_mapping=None, *, ns=None): + u"""Extension(module, function_mapping=None, ns=None) + + Build a dictionary of extension functions from the functions + defined in a module or the methods of an object. + + As second argument, you can pass an additional mapping of + attribute names to XPath function names, or a list of function + names that should be taken. + + The ``ns`` keyword argument accepts a namespace URI for the XPath + functions. + """ + cdef dict functions = {} + if isinstance(function_mapping, dict): + for function_name, xpath_name in function_mapping.items(): + functions[(ns, xpath_name)] = getattr(module, function_name) + else: + if function_mapping is None: + function_mapping = [ name for name in dir(module) + if not name.startswith(u'_') ] + for function_name in function_mapping: + functions[(ns, function_name)] = getattr(module, function_name) + return functions + +################################################################################ +# EXSLT regexp implementation + +@cython.final +@cython.internal +cdef class _ExsltRegExp: + cdef dict _compile_map + def __cinit__(self): + self._compile_map = {} + + cdef _make_string(self, value): + if _isString(value): + return value + elif isinstance(value, list): + # node set: take recursive text concatenation of first element + if python.PyList_GET_SIZE(value) == 0: + return u'' + firstnode = value[0] + if _isString(firstnode): + return firstnode + elif isinstance(firstnode, _Element): + c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node) + if c_text is NULL: + raise MemoryError() + try: + return funicode(c_text) + finally: + tree.xmlFree(c_text) + else: + return unicode(firstnode) + else: + return unicode(value) + + cdef _compile(self, rexp, ignore_case): + cdef python.PyObject* c_result + rexp = self._make_string(rexp) + key = (rexp, ignore_case) + c_result = python.PyDict_GetItem(self._compile_map, key) + if c_result is not NULL: + return c_result + py_flags = re.UNICODE + if ignore_case: + py_flags = py_flags | re.IGNORECASE + rexp_compiled = re.compile(rexp, py_flags) + self._compile_map[key] = rexp_compiled + return rexp_compiled + + def test(self, ctxt, s, rexp, flags=u''): + flags = self._make_string(flags) + s = self._make_string(s) + rexpc = self._compile(rexp, u'i' in flags) + if rexpc.search(s) is None: + return False + else: + return True + + def match(self, ctxt, s, rexp, flags=u''): + cdef list result_list + flags = self._make_string(flags) + s = self._make_string(s) + rexpc = self._compile(rexp, u'i' in flags) + if u'g' in flags: + results = rexpc.findall(s) + if not results: + return () + else: + result = rexpc.search(s) + if not result: + return () + results = [ result.group() ] + results.extend( result.groups(u'') ) + result_list = [] + root = Element(u'matches') + join_groups = u''.join + for s_match in results: + if python.PyTuple_CheckExact(s_match): + s_match = join_groups(s_match) + elem = SubElement(root, u'match') + elem.text = s_match + result_list.append(elem) + return result_list + + def replace(self, ctxt, s, rexp, flags, replacement): + replacement = self._make_string(replacement) + flags = self._make_string(flags) + s = self._make_string(s) + rexpc = self._compile(rexp, u'i' in flags) + if u'g' in flags: + count = 0 + else: + count = 1 + return rexpc.sub(replacement, s, count) + + cdef _register_in_context(self, _BaseContext context): + ns = b"http://exslt.org/regular-expressions" + context._addLocalExtensionFunction(ns, b"test", self.test) + context._addLocalExtensionFunction(ns, b"match", self.match) + context._addLocalExtensionFunction(ns, b"replace", self.replace) + + +################################################################################ +# helper functions + +cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc, + _BaseContext context) except NULL: + cdef xpath.xmlNodeSet* resultSet + cdef _Element fake_node = None + cdef xmlNode* c_node + + if isinstance(obj, unicode): + obj = _utf8(obj) + if isinstance(obj, bytes): + # libxml2 copies the string value + return xpath.xmlXPathNewCString(_cstr(obj)) + if isinstance(obj, bool): + return xpath.xmlXPathNewBoolean(obj) + if python.PyNumber_Check(obj): + return xpath.xmlXPathNewFloat(obj) + if obj is None: + resultSet = xpath.xmlXPathNodeSetCreate(NULL) + elif isinstance(obj, _Element): + resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node) + elif python.PySequence_Check(obj): + resultSet = xpath.xmlXPathNodeSetCreate(NULL) + try: + for value in obj: + if isinstance(value, _Element): + if context is not None: + context._hold(value) + xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node) + else: + if context is None or doc is None: + raise XPathResultError, \ + u"Non-Element values not supported at this point - got %r" % value + # support strings by appending text nodes to an Element + if isinstance(value, unicode): + value = _utf8(value) + if isinstance(value, bytes): + if fake_node is None: + fake_node = _makeElement("text-root", NULL, doc, None, + None, None, None, None, None) + context._hold(fake_node) + else: + # append a comment node to keep the text nodes separate + c_node = tree.xmlNewDocComment(doc._c_doc, "") + if c_node is NULL: + raise MemoryError() + tree.xmlAddChild(fake_node._c_node, c_node) + context._hold(value) + c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value)) + if c_node is NULL: + raise MemoryError() + tree.xmlAddChild(fake_node._c_node, c_node) + xpath.xmlXPathNodeSetAdd(resultSet, c_node) + else: + raise XPathResultError, \ + u"This is not a supported node-set result: %r" % value + except: + xpath.xmlXPathFreeNodeSet(resultSet) + raise + else: + raise XPathResultError, u"Unknown return type: %s" % \ + python._fqtypename(obj).decode('utf8') + return xpath.xmlXPathWrapNodeSet(resultSet) + +cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj, + _Document doc, _BaseContext context): + if xpathObj.type == xpath.XPATH_UNDEFINED: + raise XPathResultError, u"Undefined xpath result" + elif xpathObj.type == xpath.XPATH_NODESET: + return _createNodeSetResult(xpathObj, doc, context) + elif xpathObj.type == xpath.XPATH_BOOLEAN: + return xpathObj.boolval + elif xpathObj.type == xpath.XPATH_NUMBER: + return xpathObj.floatval + elif xpathObj.type == xpath.XPATH_STRING: + stringval = funicode(xpathObj.stringval) + if context._build_smart_strings: + stringval = _elementStringResultFactory( + stringval, None, None, 0) + return stringval + elif xpathObj.type == xpath.XPATH_POINT: + raise NotImplementedError, u"XPATH_POINT" + elif xpathObj.type == xpath.XPATH_RANGE: + raise NotImplementedError, u"XPATH_RANGE" + elif xpathObj.type == xpath.XPATH_LOCATIONSET: + raise NotImplementedError, u"XPATH_LOCATIONSET" + elif xpathObj.type == xpath.XPATH_USERS: + raise NotImplementedError, u"XPATH_USERS" + elif xpathObj.type == xpath.XPATH_XSLT_TREE: + return _createNodeSetResult(xpathObj, doc, context) + else: + raise XPathResultError, u"Unknown xpath result %s" % unicode(xpathObj.type) + +cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc, + _BaseContext context): + cdef xmlNode* c_node + cdef int i + cdef list result + result = [] + if xpathObj.nodesetval is NULL: + return result + for i in range(xpathObj.nodesetval.nodeNr): + c_node = xpathObj.nodesetval.nodeTab[i] + _unpackNodeSetEntry(result, c_node, doc, context, + xpathObj.type == xpath.XPATH_XSLT_TREE) + return result + +cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc, + _BaseContext context, bint is_fragment): + cdef xmlNode* c_child + if _isElement(c_node): + if c_node.doc != doc._c_doc and c_node.doc._private is NULL: + # XXX: works, but maybe not always the right thing to do? + # XPath: only runs when extensions create or copy trees + # -> we store Python refs to these, so that is OK + # XSLT: can it leak when merging trees from multiple sources? + c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) + # FIXME: call _instantiateElementFromXPath() instead? + results.append( + _fakeDocElementFactory(doc, c_node)) + elif c_node.type == tree.XML_TEXT_NODE or \ + c_node.type == tree.XML_CDATA_SECTION_NODE or \ + c_node.type == tree.XML_ATTRIBUTE_NODE: + results.append( + _buildElementStringResult(doc, c_node, context)) + elif c_node.type == tree.XML_NAMESPACE_DECL: + results.append( (funicodeOrNone((c_node).prefix), + funicodeOrNone((c_node).href)) ) + elif c_node.type == tree.XML_DOCUMENT_NODE or \ + c_node.type == tree.XML_HTML_DOCUMENT_NODE: + # ignored for everything but result tree fragments + if is_fragment: + c_child = c_node.children + while c_child is not NULL: + _unpackNodeSetEntry(results, c_child, doc, context, 0) + c_child = c_child.next + elif c_node.type == tree.XML_XINCLUDE_START or \ + c_node.type == tree.XML_XINCLUDE_END: + pass + else: + raise NotImplementedError, \ + u"Not yet implemented result node type: %d" % c_node.type + +cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj): + u"""Free the XPath object, but *never* free the *content* of node sets. + Python dealloc will do that for us. + """ + if xpathObj.nodesetval is not NULL: + xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval) + xpathObj.nodesetval = NULL + xpath.xmlXPathFreeObject(xpathObj) + +cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc, + _BaseContext context): + # NOTE: this may copy the element - only call this when it can't leak + if c_node.doc != doc._c_doc and c_node.doc._private is NULL: + # not from the context document and not from a fake document + # either => may still be from a known document, e.g. one + # created by an extension function + doc = context._findDocumentForNode(c_node) + if doc is None: + # not from a known document at all! => can only make a + # safety copy here + c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1) + return _fakeDocElementFactory(doc, c_node) + +################################################################################ +# special str/unicode subclasses + +@cython.final +cdef class _ElementUnicodeResult(unicode): + cdef _Element _parent + cdef readonly object attrname + cdef readonly bint is_tail + cdef readonly bint is_text + cdef readonly bint is_attribute + + def getparent(self): + return self._parent + +class _ElementStringResult(bytes): + # we need to use a Python class here, bytes cannot be C-subclassed + # in Pyrex/Cython + def getparent(self): + return self._parent + +cdef object _elementStringResultFactory(string_value, _Element parent, + attrname, bint is_tail): + cdef _ElementUnicodeResult uresult + cdef bint is_text + cdef bint is_attribute = attrname is not None + if parent is None: + is_text = 0 + else: + is_text = not (is_tail or is_attribute) + + if type(string_value) is bytes: + result = _ElementStringResult(string_value) + result._parent = parent + result.is_attribute = is_attribute + result.is_tail = is_tail + result.is_text = is_text + result.attrname = attrname + return result + else: + uresult = _ElementUnicodeResult(string_value) + uresult._parent = parent + uresult.is_attribute = is_attribute + uresult.is_tail = is_tail + uresult.is_text = is_text + uresult.attrname = attrname + return uresult + +cdef object _buildElementStringResult(_Document doc, xmlNode* c_node, + _BaseContext context): + cdef _Element parent = None + cdef object attrname = None + cdef xmlNode* c_element + cdef bint is_tail + + if c_node.type == tree.XML_ATTRIBUTE_NODE: + attrname = _namespacedName(c_node) + is_tail = 0 + s = tree.xmlNodeGetContent(c_node) + try: + value = funicode(s) + finally: + tree.xmlFree(s) + c_element = NULL + else: + #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type" + # may be tail text or normal text + value = funicode(c_node.content) + c_element = _previousElement(c_node) + is_tail = c_element is not NULL + + if not context._build_smart_strings: + return value + + if c_element is NULL: + # non-tail text or attribute text + c_element = c_node.parent + while c_element is not NULL and not _isElement(c_element): + c_element = c_element.parent + + if c_element is not NULL: + parent = _instantiateElementFromXPath(c_element, doc, context) + + return _elementStringResultFactory( + value, parent, attrname, is_tail) + +################################################################################ +# callbacks for XPath/XSLT extension functions + +cdef void _extension_function_call(_BaseContext context, function, + xpath.xmlXPathParserContext* ctxt, int nargs): + cdef _Document doc + cdef xpath.xmlXPathObject* obj + cdef list args + cdef int i + doc = context._doc + try: + args = [] + for i in range(nargs): + obj = xpath.valuePop(ctxt) + o = _unwrapXPathObject(obj, doc, context) + _freeXPathObject(obj) + args.append(o) + args.reverse() + + res = function(context, *args) + # wrap result for XPath consumption + obj = _wrapXPathObject(res, doc, context) + # prevent Python from deallocating elements handed to libxml2 + context._hold(res) + xpath.valuePush(ctxt, obj) + except: + xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR) + context._exc._store_raised() + finally: + return # swallow any further exceptions + +# lookup the function by name and call it + +cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt, + int nargs) with gil: + cdef _BaseContext context + cdef xpath.xmlXPathContext* rctxt = ctxt.context + context = <_BaseContext> rctxt.userData + try: + function = context._find_cached_function(rctxt.functionURI, rctxt.function) + if function is not None: + _extension_function_call(context, function, ctxt, nargs) + else: + xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR) + context._exc._store_exception( + XPathFunctionError(u"XPath function '%s' not found" % + _namespacedNameFromNsName(rctxt.functionURI, rctxt.function))) + except: + # may not be the right error, but we need to tell libxml2 *something* + xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR) + context._exc._store_raised() + finally: + return # swallow any further exceptions diff --git a/lib/lxml/html/ElementSoup.py b/lib/lxml/html/ElementSoup.py new file mode 100644 index 00000000..8e4fde13 --- /dev/null +++ b/lib/lxml/html/ElementSoup.py @@ -0,0 +1,10 @@ +__doc__ = """Legacy interface to the BeautifulSoup HTML parser. +""" + +__all__ = ["parse", "convert_tree"] + +from soupparser import convert_tree, parse as _parse + +def parse(file, beautifulsoup=None, makeelement=None): + root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement) + return root.getroot() diff --git a/lib/lxml/html/__init__.py b/lib/lxml/html/__init__.py new file mode 100644 index 00000000..fe28c3bb --- /dev/null +++ b/lib/lxml/html/__init__.py @@ -0,0 +1,1697 @@ +# Copyright (c) 2004 Ian Bicking. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. Neither the name of Ian Bicking nor the names of its contributors may +# be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""The ``lxml.html`` tool set for HTML handling. +""" + +import sys +import re +try: + from urlparse import urljoin +except ImportError: + # Python 3 + from urllib.parse import urljoin +import copy +from lxml import etree +from lxml.html import defs +from lxml.html._setmixin import SetMixin +try: + from collections import MutableMapping as DictMixin +except ImportError: + # Python < 2.6 + from UserDict import DictMixin +try: + set +except NameError: + # Python 2.3 + from sets import Set as set +try: + bytes +except NameError: + # Python < 2.6 + bytes = str +try: + unicode +except NameError: + # Python 3 + unicode = str +try: + basestring +except NameError: + # Python 3 + basestring = (str, bytes) + +def __fix_docstring(s): + if not s: + return s + import sys + if sys.version_info[0] >= 3: + sub = re.compile(r"^(\s*)u'", re.M).sub + else: + sub = re.compile(r"^(\s*)b'", re.M).sub + return sub(r"\1'", s) + +__all__ = [ + 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', + 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', + 'find_rel_links', 'find_class', 'make_links_absolute', + 'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser', 'parse'] + +XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml" + +_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]", + namespaces={'x':XHTML_NAMESPACE}) +_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option", + namespaces={'x':XHTML_NAMESPACE}) +_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form", + namespaces={'x':XHTML_NAMESPACE}) +#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) +_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") +_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") +_collect_string_content = etree.XPath("string()") +_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I) +_css_import_re = re.compile(r'@import "(.*?)"') +_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]", + namespaces={'x':XHTML_NAMESPACE}) +_archive_re = re.compile(r'[^ ]+') + +def _unquote_match(s, pos): + if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": + return s[1:-1], pos+1 + else: + return s,pos + +def _transform_result(typ, result): + """Convert the result back into the input type. + """ + if issubclass(typ, bytes): + return tostring(result, encoding='utf-8') + elif issubclass(typ, unicode): + return tostring(result, encoding='unicode') + else: + return result + +def _nons(tag): + if isinstance(tag, basestring): + if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE: + return tag.split('}')[-1] + return tag + +class HtmlMixin(object): + + def base_url(self): + """ + Returns the base URL, given when the page was parsed. + + Use with ``urlparse.urljoin(el.base_url, href)`` to get + absolute URLs. + """ + return self.getroottree().docinfo.URL + base_url = property(base_url, doc=base_url.__doc__) + + def forms(self): + """ + Return a list of all the forms + """ + return _forms_xpath(self) + forms = property(forms, doc=forms.__doc__) + + def body(self): + """ + Return the element. Can be called from a child element + to get the document's head. + """ + return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0] + body = property(body, doc=body.__doc__) + + def head(self): + """ + Returns the element. Can be called from a child + element to get the document's head. + """ + return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0] + head = property(head, doc=head.__doc__) + + def _label__get(self): + """ + Get or set any