mirror of
https://github.com/moparisthebest/SickRage
synced 2025-01-08 04:18:09 -05:00
211 lines
7.8 KiB
Cython
211 lines
7.8 KiB
Cython
# functions for tree cleanup and removing elements from subtrees
|
|
|
|
def cleanup_namespaces(tree_or_element):
    u"""cleanup_namespaces(tree_or_element)

    Drop every namespace declaration within a subtree that none of the
    elements or attributes of that subtree make use of.
    """
    cdef _Element root = _rootNodeOrRaise(tree_or_element)
    _removeUnusedNamespaceDeclarations(root._c_node)
|
|
|
|
def strip_attributes(tree_or_element, *attribute_names):
    u"""strip_attributes(tree_or_element, *attribute_names)

    Remove every attribute whose name is listed in ``attribute_names``
    from an Element (or ElementTree) and from all of its descendants.

    Wildcards are accepted in attribute names, as in `_Element.iter`.

    Example usage::

        strip_attributes(root_element,
                         'simpleattr',
                         '{http://some/ns}attrname',
                         '{http://other/ns}*')
    """
    # The input is resolved up front, i.e. even when no names were passed.
    cdef _Element root = _rootNodeOrRaise(tree_or_element)
    cdef _MultiTagMatcher name_matcher

    if attribute_names:
        name_matcher = _MultiTagMatcher(attribute_names)
        name_matcher.cacheTags(root._doc)
        # Skip the tree traversal entirely if no attribute can match.
        if not name_matcher.rejectsAllAttributes():
            _strip_attributes(root._c_node, name_matcher)
|
|
|
|
cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
    # Remove each attribute accepted by 'matcher' from the element nodes
    # visited by the tree iteration macros, starting at 'c_node'.
    cdef xmlAttr* c_current
    cdef xmlAttr* c_following
    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        c_current = c_node.properties
        while c_current is not NULL:
            # Read the successor first: the current attribute may be
            # removed below and must not be touched afterwards.
            c_following = c_current.next
            if matcher.matchesAttribute(c_current):
                tree.xmlRemoveProp(c_current)
            c_current = c_following
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
|
|
|
def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
    u"""strip_elements(tree_or_element, *tag_names, with_tail=True)

    Delete all elements with the provided tag names from a tree or
    subtree.  This will remove the elements and their entire subtree,
    including all their attributes, text content and descendants.  It
    will also remove the tail text of the element unless you
    explicitly set the ``with_tail`` keyword argument option to False.

    Tag names can contain wildcards as in `_Element.iter`.

    Note that this will not delete the element (or ElementTree root
    element) that you passed even if it matches.  It will only treat
    its descendants.  If you want to include the root element, check
    its tag name directly before even calling this function.

    Example usage::

        strip_elements(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            lxml.etree.Comment           # comments
            )
    """
    cdef _MultiTagMatcher matcher
    cdef _Element element
    cdef _Document doc

    # Resolve document and root node first so that invalid input is
    # rejected even when no tag names were passed.
    doc = _documentOrRaise(tree_or_element)
    element = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    matcher = _MultiTagMatcher(tag_names)
    matcher.cacheTags(doc)
    if matcher.rejectsAll():
        # nothing can match, so there is no need to traverse the tree
        return

    if isinstance(tree_or_element, _ElementTree):
        # include PIs and comments next to the root node
        if matcher.matchesType(tree.XML_COMMENT_NODE):
            _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
        if matcher.matchesType(tree.XML_PI_NODE):
            _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
    _strip_elements(doc, element._c_node, matcher, with_tail)
|
|
|
|
cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
                     bint with_tail):
    # Remove all children accepted by 'matcher' from the element nodes
    # visited by the tree iteration macros, starting at 'c_node'.
    # 'with_tail' selects whether the tail text of a removed node goes
    # away together with it.
    cdef xmlNode* c_candidate
    cdef xmlNode* c_following

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        # Only children of the current node get unlinked, never the
        # current node itself, as that would break the tree iteration.
        c_candidate = _findChildForwards(c_node, 0)
        while c_candidate is not NULL:
            # look up the successor before the candidate may be unlinked
            c_following = _nextElement(c_candidate)
            if not matcher.matches(c_candidate):
                c_candidate = c_following
                continue
            if c_candidate.type != tree.XML_ELEMENT_NODE:
                # matched node that is not an element
                if with_tail:
                    _removeText(c_candidate.next)
                tree.xmlUnlinkNode(c_candidate)
                attemptDeallocation(c_candidate)
            else:
                if not with_tail:
                    # NOTE(review): presumably unlinking up front keeps
                    # _removeNode() from taking the tail text along -
                    # confirm against _removeNode()
                    tree.xmlUnlinkNode(c_candidate)
                _removeNode(doc, c_candidate)
            c_candidate = c_following
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|
|
|
|
|
|
def strip_tags(tree_or_element, *tag_names):
    u"""strip_tags(tree_or_element, *tag_names)

    Delete all elements with the provided tag names from a tree or
    subtree.  This will remove the elements and their attributes, but
    *not* their text/tail content or descendants.  Instead, it will
    merge the text content and children of the element into its
    parent.

    Tag names can contain wildcards as in `_Element.iter`.

    Note that this will not delete the element (or ElementTree root
    element) that you passed even if it matches.  It will only treat
    its descendants.

    Example usage::

        strip_tags(some_element,
            'simpletagname',             # non-namespaced tag
            '{http://some/ns}tagname',   # namespaced tag
            '{http://some/other/ns}*',   # any tag from a namespace
            Comment                      # comments (including their text!)
            )
    """
    cdef _MultiTagMatcher matcher
    cdef _Element element
    cdef _Document doc

    # Resolve document and root node first so that invalid input is
    # rejected even when no tag names were passed.
    doc = _documentOrRaise(tree_or_element)
    element = _rootNodeOrRaise(tree_or_element)
    if not tag_names:
        return

    matcher = _MultiTagMatcher(tag_names)
    matcher.cacheTags(doc)
    if matcher.rejectsAll():
        # nothing can match, so there is no need to traverse the tree
        return

    if isinstance(tree_or_element, _ElementTree):
        # include PIs and comments next to the root node
        if matcher.matchesType(tree.XML_COMMENT_NODE):
            _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
        if matcher.matchesType(tree.XML_PI_NODE):
            _removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
    _strip_tags(doc, element._c_node, matcher)
|
|
|
|
cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
    # For the element nodes visited by the tree iteration macros, starting
    # at 'c_node': replace each matching child element by its own children
    # and drop each matching non-element child.
    cdef xmlNode* c_child
    cdef xmlNode* c_next

    tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
    if c_node.type == tree.XML_ELEMENT_NODE:
        # we run through the children here to prevent any problems
        # with the tree iteration which would occur if we unlinked the
        # c_node itself
        c_child = _findChildForwards(c_node, 0)
        while c_child is not NULL:
            if not matcher.matches(c_child):
                c_child = _nextElement(c_child)
                continue
            if c_child.type == tree.XML_ELEMENT_NODE:
                # Continue with the first child of the stripped element if
                # it has one, otherwise with the node following it.  Both
                # must be looked up before the element is replaced.
                c_next = _findChildForwards(c_child, 0)
                if c_next is NULL:
                    c_next = _nextElement(c_child)
                _replaceNodeByChildren(doc, c_child)
                if not attemptDeallocation(c_child) and c_child.nsDef is not NULL:
                    # the node was not deallocated:
                    # make namespaces absolute
                    moveNodeToDocument(doc, doc._c_doc, c_child)
            else:
                # matched node that is not an element: drop it
                c_next = _nextElement(c_child)
                tree.xmlUnlinkNode(c_child)
                attemptDeallocation(c_child)
            c_child = c_next
    tree.END_FOR_EACH_ELEMENT_FROM(c_node)
|