# -*- coding: utf-8 -*- """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import ( CData, Comment, Doctype, NavigableString, SoupStrainer, Tag, ) from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag['id'] for tag in tags], should_match) class TestFind(TreeTest): """Basic tests of the find() method. find() just calls find_all() with limit=1, so it's not tested all that thouroughly here. """ def test_find_tag(self): soup = self.soup("1234") self.assertEqual(soup.find("b").string, "2") def test_unicode_text_find(self): soup = self.soup(u'
tag are empty-element, just because
# they have no contents.
self.assertEqual(b"
", xml_br.encode())
self.assertEqual(b"
Don't leave me here.
Don\'t leave!
""" soup = self.soup(doc) second_para = soup.find(id='2') bold = soup.b # Move the tag to the end of the second paragraph. soup.find(id='2').append(soup.b) # The tag is now a child of the second paragraph. self.assertEqual(bold.parent, second_para) self.assertEqual( soup.decode(), self.document_for( 'Don\'t leave me .
\n' 'Don\'t leave!here
')) def test_replace_with_returns_thing_that_was_replaced(self): text = "onethree
') a = soup.a b = a.contents[0] # Make it so the tag has two text children. a.insert(1, "two") # Now replace each one with the empty string. left, right = a.contents left.replaceWith('') right.replaceWith('') # The tag is still connected to the tree. self.assertEqual("three", soup.b.string) def test_replace_final_node(self): soup = self.soup("Argh!") soup.find(text="Argh!").replace_with("Hooray!") new_text = soup.find(text="Hooray!") b = soup.b self.assertEqual(new_text.previous_element, b) self.assertEqual(new_text.parent, b) self.assertEqual(new_text.previous_element.next_element, new_text) self.assertEqual(new_text.next_element, None) def test_consecutive_text_nodes(self): # A builder should never create two consecutive text nodes, # but if you insert one next to another, Beautiful Soup will # handle it correctly. soup = self.soup("Argh!There's no business like show business
") no, show = soup.find_all('b') show.replace_with(no) self.assertEqual( soup.decode(), self.document_for( "There's business like no business
")) self.assertEqual(show.parent, None) self.assertEqual(no.parent, soup.p) self.assertEqual(no.next_element, "no") self.assertEqual(no.next_sibling, " business") def test_replace_first_child(self): data = "Unneeded formatting is unneeded
""") tree.em.unwrap() self.assertEqual(tree.em, None) self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") def test_wrap(self): soup = self.soup("I wish I was bold.") value = soup.string.wrap(soup.new_tag("b")) self.assertEqual(value.decode(), "I wish I was bold.") self.assertEqual( soup.decode(), self.document_for("I wish I was bold.")) def test_wrap_extracts_tag_from_elsewhere(self): soup = self.soup("I wish I was bold.") soup.b.next_sibling.wrap(soup.b) self.assertEqual( soup.decode(), self.document_for("I wish I was bold.")) def test_wrap_puts_new_contents_at_the_end(self): soup = self.soup("I like being bold.I wish I was bold.") soup.b.next_sibling.wrap(soup.b) self.assertEqual(2, len(soup.b.contents)) self.assertEqual( soup.decode(), self.document_for( "I like being bold.I wish I was bold.")) def test_extract(self): soup = self.soup( 'Some content. More content.') self.assertEqual(len(soup.body.contents), 3) extracted = soup.find(id="nav").extract() self.assertEqual( soup.decode(), "Some content. More content.") self.assertEqual(extracted.decode(), ' ') # The extracted tag is now an orphan. self.assertEqual(len(soup.body.contents), 2) self.assertEqual(extracted.parent, None) self.assertEqual(extracted.previous_element, None) self.assertEqual(extracted.next_element.next_element, None) # The gap where the extracted tag used to be has been mended. content_1 = soup.find(text="Some content. ") content_2 = soup.find(text=" More content.") self.assertEqual(content_1.next_element, content_2) self.assertEqual(content_1.next_sibling, content_2) self.assertEqual(content_2.previous_element, content_1) self.assertEqual(content_2.previous_sibling, content_1) def test_extract_distinguishes_between_identical_strings(self): soup = self.soup("foobar") foo_1 = soup.a.string bar_1 = soup.b.string foo_2 = soup.new_string("foo") bar_2 = soup.new_string("bar") soup.a.append(foo_2) soup.b.append(bar_2) # Now there are two identical strings in the tag, and two # in the tag. Let's remove the first "foo" and the second # "bar". foo_1.extract() bar_2.extract() self.assertEqual(foo_2, soup.a.string) self.assertEqual(bar_2, soup.b.string) def test_clear(self): """Tag.clear()""" soup = self.soup("String Italicized and another
") # clear using extract() a = soup.a soup.p.clear() self.assertEqual(len(soup.p.contents), 0) self.assertTrue(hasattr(a, "contents")) # clear using decompose() em = a.em a.clear(decompose=True) self.assertEqual(0, len(em.contents)) def test_string_set(self): """Tag.string = 'string'""" soup = self.soup("