summary refs log tree commit diff stats
path: root/bitbake/lib/bs4/element.py
diff options
context:
space:
mode:
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r-- bitbake/lib/bs4/element.py 2219
1 files changed, 1465 insertions, 754 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py
index 68be42d138..0aefe734b2 100644
--- a/bitbake/lib/bs4/element.py
+++ b/bitbake/lib/bs4/element.py
@@ -1,14 +1,27 @@
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
-import collections.abc
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
import re
import sys
import warnings
-from bs4.dammit import EntitySubstitution
+
+from bs4.css import CSS
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
DEFAULT_OUTPUT_ENCODING = "utf-8"
-PY3K = (sys.version_info[0] > 2)
+nonwhitespace_re = re.compile(r"\S+")
+
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+")
def _alias(attr):
@@ -23,12 +36,49 @@ def _alias(attr):
return alias
+# These encodings are recognized by Python (so PageElement.encode
+# could theoretically support them) but XML and HTML don't recognize
+# them (so they should not show up in an XML or HTML document as that
+# document's encoding).
+#
+# If an XML document is encoded in one of these encodings, no encoding
+# will be mentioned in the XML declaration. If an HTML document is
+# encoded in one of these encodings, and the HTML document has a
+# <meta> tag that mentions an encoding, the encoding will be given as
+# the empty string.
+#
+# Source:
+# https://docs.python.org/3/library/codecs.html#python-specific-encodings
+PYTHON_SPECIFIC_ENCODINGS = set([
+ "idna",
+ "mbcs",
+ "oem",
+ "palmos",
+ "punycode",
+ "raw_unicode_escape",
+ "undefined",
+ "unicode_escape",
+ "raw-unicode-escape",
+ "unicode-escape",
+ "string-escape",
+ "string_escape",
+])
+
+
class NamespacedAttribute(str):
+ """A namespaced string (e.g. 'xml:lang') that remembers the namespace
+ ('xml') and the name ('lang') that were used to create it.
+ """
- def __new__(cls, prefix, name, namespace=None):
- if name is None:
+ def __new__(cls, prefix, name=None, namespace=None):
+ if not name:
+ # This is the default namespace. Its name "has no value"
+ # per https://www.w3.org/TR/xml-names/#defaulting
+ name = None
+
+ if not name:
obj = str.__new__(cls, prefix)
- elif prefix is None:
+ elif not prefix:
# Not really namespaced.
obj = str.__new__(cls, name)
else:
@@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ """When an HTML document is being encoded to a given encoding, the
+ value of a meta tag's 'charset' is the name of the encoding.
+ """
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
return encoding
@@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return obj
def encode(self, encoding):
+ if encoding in PYTHON_SPECIFIC_ENCODINGS:
+ return ''
def rewrite(match):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
-
- """Entity substitution rules that are aware of some HTML quirks.
- Specifically, the contents of <script> and <style> tags should not
- undergo entity substitution.
+class PageElement(object):
+ """Contains the navigational information for some part of the page:
+ that is, its current location in the parse tree.
- Incoming NavigableString objects are checked to see if they're the
- direct children of a <script> or <style> tag.
+ NavigableString, Tag, etc. are all subclasses of PageElement.
"""
- cdata_containing_tags = set(["script", "style"])
+ # In general, we can't tell just by looking at an element whether
+ # it's contained in an XML document or an HTML document. But for
+ # Tags (q.v.) we can store this information at parse time.
+ known_xml = None
- preformatted_tags = set(["pre"])
-
- @classmethod
- def _substitute_if_appropriate(cls, ns, f):
- if (isinstance(ns, NavigableString)
- and ns.parent is not None
- and ns.parent.name in cls.cdata_containing_tags):
- # Do nothing.
- return ns
- # Substitute.
- return f(ns)
+ def setup(self, parent=None, previous_element=None, next_element=None,
+ previous_sibling=None, next_sibling=None):
+ """Sets up the initial relations between this element and
+ other elements.
- @classmethod
- def substitute_html(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_html)
+ :param parent: The parent of this element.
- @classmethod
- def substitute_xml(cls, ns):
- return cls._substitute_if_appropriate(
- ns, EntitySubstitution.substitute_xml)
+ :param previous_element: The element parsed immediately before
+ this one.
-class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
-
- # There are five possible values for the "formatter" argument passed in
- # to methods like encode() and prettify():
- #
- # "html" - All Unicode characters with corresponding HTML entities
- # are converted to those entities on output.
- # "minimal" - Bare ampersands and angle brackets are converted to
- # XML entities: &amp; &lt; &gt;
- # None - The null formatter. Unicode characters are never
- # converted to entities. This is not recommended, but it's
- # faster than "minimal".
- # A function - This function will be called on every string that
- # needs to undergo entity substitution.
- #
-
- # In an HTML document, the default "html" and "minimal" functions
- # will leave the contents of <script> and <style> tags alone. For
- # an XML document, all tags will be given the same treatment.
-
- HTML_FORMATTERS = {
- "html" : HTMLAwareEntitySubstitution.substitute_html,
- "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
- None : None
- }
-
- XML_FORMATTERS = {
- "html" : EntitySubstitution.substitute_html,
- "minimal" : EntitySubstitution.substitute_xml,
- None : None
- }
-
- def format_string(self, s, formatter='minimal'):
- """Format the given string using the given formatter."""
- if not isinstance(formatter, collections.abc.Callable):
- formatter = self._formatter_for_name(formatter)
- if formatter is None:
- output = s
- else:
- output = formatter(s)
- return output
+ :param next_element: The element parsed immediately after
+ this one.
- @property
- def _is_xml(self):
- """Is this element part of an XML tree or an HTML tree?
+ :param previous_sibling: The most recently encountered element
+ on the same level of the parse tree as this one.
- This is used when mapping a formatter name ("minimal") to an
- appropriate function (one that performs entity-substitution on
- the contents of <script> and <style> tags, or not). It's
- inefficient, but it should be called very rarely.
+ :param next_sibling: The next element to be encountered
+ on the same level of the parse tree as this one.
"""
- if self.parent is None:
- # This is the top-level object. It should have .is_xml set
- # from tree creation. If not, take a guess--BS is usually
- # used on HTML markup.
- return getattr(self, 'is_xml', False)
- return self.parent._is_xml
-
- def _formatter_for_name(self, name):
- "Look up a formatter function based on its name and the tree."
- if self._is_xml:
- return self.XML_FORMATTERS.get(
- name, EntitySubstitution.substitute_xml)
- else:
- return self.HTML_FORMATTERS.get(
- name, HTMLAwareEntitySubstitution.substitute_xml)
-
- def setup(self, parent=None, previous_element=None, next_element=None,
- previous_sibling=None, next_sibling=None):
- """Sets up the initial relations between this element and
- other elements."""
self.parent = parent
self.previous_element = previous_element
@@ -198,48 +179,156 @@ class PageElement(object):
self.previous_element.next_element = self
self.next_element = next_element
- if self.next_element:
+ if self.next_element is not None:
self.next_element.previous_element = self
self.next_sibling = next_sibling
- if self.next_sibling:
+ if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
- if (not previous_sibling
+ if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
- if previous_sibling:
+ if previous_sibling is not None:
self.previous_sibling.next_sibling = self
+ def format_string(self, s, formatter):
+ """Format the given string using the given formatter.
+
+ :param s: A string.
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
+ if formatter is None:
+ return s
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
+ output = formatter.substitute(s)
+ return output
+
+ def formatter_for_name(self, formatter):
+ """Look up or create a Formatter for the given identifier,
+ if necessary.
+
+ :param formatter: Can be a Formatter object (used as-is), a
+ function (used as the entity substitution hook for an
+ XMLFormatter or HTMLFormatter), or a string (used to look
+ up an XMLFormatter or HTMLFormatter in the appropriate
+ registry).
+ """
+ if isinstance(formatter, Formatter):
+ return formatter
+ if self._is_xml:
+ c = XMLFormatter
+ else:
+ c = HTMLFormatter
+ if isinstance(formatter, Callable):
+ return c(entity_substitution=formatter)
+ return c.REGISTRY[formatter]
+
+ @property
+ def _is_xml(self):
+ """Is this element part of an XML tree or an HTML tree?
+
+ This is used in formatter_for_name, when deciding whether an
+ XMLFormatter or HTMLFormatter is more appropriate. It can be
+ inefficient, but it should be called very rarely.
+ """
+ if self.known_xml is not None:
+ # Most of the time we will have determined this when the
+ # document is parsed.
+ return self.known_xml
+
+ # Otherwise, it's likely that this element was created by
+ # direct invocation of the constructor from within the user's
+ # Python code.
+ if self.parent is None:
+ # This is the top-level object. It should have .known_xml set
+ # from tree creation. If not, take a guess--BS is usually
+ # used on HTML markup.
+ return getattr(self, 'is_xml', False)
+ return self.parent._is_xml
+
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
- def replace_with(self, replace_with):
- if not self.parent:
+ default = object()
+ def _all_strings(self, strip=False, types=default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This is implemented differently in Tag and NavigableString.
+ """
+ raise NotImplementedError()
+
+ @property
+ def stripped_strings(self):
+ """Yield all strings in this PageElement, stripping them first.
+
+ :yield: A sequence of stripped strings.
+ """
+ for string in self._all_strings(True):
+ yield string
+
+ def get_text(self, separator="", strip=False,
+ types=default):
+ """Get all child strings of this PageElement, concatenated using the
+ given separator.
+
+ :param separator: Strings will be concatenated using this separator.
+
+ :param strip: If True, strings will be stripped before being
+ concatenated.
+
+ :param types: A tuple of NavigableString subclasses. Any
+ strings of a subclass not found in this list will be
+ ignored. Although there are exceptions, the default
+ behavior in most cases is to consider only NavigableString
+ and CData objects. That means no comments, processing
+ instructions, etc.
+
+ :return: A string.
+ """
+ return separator.join([s for s in self._all_strings(
+ strip, types=types)])
+ getText = get_text
+ text = property(get_text)
+
+ def replace_with(self, *args):
+ """Replace this PageElement with one or more PageElements, keeping the
+ rest of the tree the same.
+
+ :param args: One or more PageElements.
+ :return: `self`, no longer part of the tree.
+ """
+ if self.parent is None:
raise ValueError(
- "Cannot replace one element with another when the"
+ "Cannot replace one element with another when the "
"element to be replaced is not part of a tree.")
- if replace_with is self:
+ if len(args) == 1 and args[0] is self:
return
- if replace_with is self.parent:
+ if any(x is self.parent for x in args):
raise ValueError("Cannot replace a Tag with its parent.")
old_parent = self.parent
my_index = self.parent.index(self)
- self.extract()
- old_parent.insert(my_index, replace_with)
+ self.extract(_self_index=my_index)
+ for idx, replace_with in enumerate(args, start=my_index):
+ old_parent.insert(idx, replace_with)
return self
replaceWith = replace_with # BS3
def unwrap(self):
+ """Replace this PageElement with its contents.
+
+ :return: `self`, no longer part of the tree.
+ """
my_parent = self.parent
- if not self.parent:
+ if self.parent is None:
raise ValueError(
"Cannot replace an element with its contents when that"
"element is not part of a tree.")
my_index = self.parent.index(self)
- self.extract()
+ self.extract(_self_index=my_index)
for child in reversed(self.contents[:]):
my_parent.insert(my_index, child)
return self
@@ -247,14 +336,29 @@ class PageElement(object):
replaceWithChildren = unwrap # BS3
def wrap(self, wrap_inside):
+ """Wrap this PageElement inside another one.
+
+ :param wrap_inside: A PageElement.
+ :return: `wrap_inside`, occupying the position in the tree that used
+ to be occupied by `self`, and with `self` inside it.
+ """
me = self.replace_with(wrap_inside)
wrap_inside.append(me)
return wrap_inside
- def extract(self):
- """Destructively rips this element out of the tree."""
+ def extract(self, _self_index=None):
+ """Destructively rips this element out of the tree.
+
+ :param _self_index: The location of this element in its parent's
+ .contents, if known. Passing this in allows for a performance
+ optimization.
+
+ :return: `self`, no longer part of the tree.
+ """
if self.parent is not None:
- del self.parent.contents[self.parent.index(self)]
+ if _self_index is None:
+ _self_index = self.parent.index(self)
+ del self.parent.contents[_self_index]
#Find the two elements that would be next to each other if
#this element (and any children) hadn't been parsed. Connect
@@ -281,8 +385,13 @@ class PageElement(object):
return self
def _last_descendant(self, is_initialized=True, accept_self=True):
- "Finds the last element beneath this object to be parsed."
- if is_initialized and self.next_sibling:
+ """Finds the last element beneath this object to be parsed.
+
+ :param is_initialized: Has `setup` been called on this PageElement
+ yet?
+ :param accept_self: Is `self` an acceptable answer to the question?
+ """
+ if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element
else:
last_child = self
@@ -295,6 +404,14 @@ class PageElement(object):
_lastRecursiveChild = _last_descendant
def insert(self, position, new_child):
+ """Insert a new PageElement in the list of this PageElement's children.
+
+ This works the same way as `list.insert`.
+
+ :param position: The numeric position that should be occupied
+ in `self.contents` by the new PageElement.
+ :param new_child: A PageElement.
+ """
if new_child is None:
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
@@ -303,6 +420,14 @@ class PageElement(object):
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
+ from bs4 import BeautifulSoup
+ if isinstance(new_child, BeautifulSoup):
+ # We don't want to end up with a situation where one BeautifulSoup
+ # object contains another. Insert the children one at a time.
+ for subchild in list(new_child.contents):
+ self.insert(position, subchild)
+ position += 1
+ return
position = min(position, len(self.contents))
if hasattr(new_child, 'parent') and new_child.parent is not None:
# We're 'inserting' an element that's already one
@@ -361,160 +486,326 @@ class PageElement(object):
self.contents.insert(position, new_child)
def append(self, tag):
- """Appends the given tag to the contents of this tag."""
+ """Appends the given PageElement to the contents of this one.
+
+ :param tag: A PageElement.
+ """
self.insert(len(self.contents), tag)
- def insert_before(self, predecessor):
- """Makes the given element the immediate predecessor of this one.
+ def extend(self, tags):
+ """Appends the given PageElements to this one's contents.
- The two elements will have the same parent, and the given element
+ :param tags: A list of PageElements. If a single Tag is
+ provided instead, this PageElement's contents will be extended
+ with that Tag's contents.
+ """
+ if isinstance(tags, Tag):
+ tags = tags.contents
+ if isinstance(tags, list):
+ # Moving items around the tree may change their position in
+ # the original list. Make a list that won't change.
+ tags = list(tags)
+ for tag in tags:
+ self.append(tag)
+
+ def insert_before(self, *args):
+ """Makes the given element(s) the immediate predecessor of this one.
+
+ All the elements will have the same parent, and the given elements
will be immediately before this one.
+
+ :param args: One or more PageElements.
"""
- if self is predecessor:
- raise ValueError("Can't insert an element before itself.")
parent = self.parent
if parent is None:
raise ValueError(
"Element has no parent, so 'before' has no meaning.")
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(predecessor, PageElement):
- predecessor.extract()
- index = parent.index(self)
- parent.insert(index, predecessor)
-
- def insert_after(self, successor):
- """Makes the given element the immediate successor of this one.
-
- The two elements will have the same parent, and the given element
+ if any(x is self for x in args):
+ raise ValueError("Can't insert an element before itself.")
+ for predecessor in args:
+ # Extract first so that the index won't be screwed up if they
+ # are siblings.
+ if isinstance(predecessor, PageElement):
+ predecessor.extract()
+ index = parent.index(self)
+ parent.insert(index, predecessor)
+
+ def insert_after(self, *args):
+ """Makes the given element(s) the immediate successor of this one.
+
+ The elements will have the same parent, and the given elements
will be immediately after this one.
+
+ :param args: One or more PageElements.
"""
- if self is successor:
- raise ValueError("Can't insert an element after itself.")
+ # Do all error checking before modifying the tree.
parent = self.parent
if parent is None:
raise ValueError(
"Element has no parent, so 'after' has no meaning.")
- # Extract first so that the index won't be screwed up if they
- # are siblings.
- if isinstance(successor, PageElement):
- successor.extract()
- index = parent.index(self)
- parent.insert(index+1, successor)
-
- def find_next(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears after this Tag in the document."""
- return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
+ if any(x is self for x in args):
+ raise ValueError("Can't insert an element after itself.")
+
+ offset = 0
+ for successor in args:
+ # Extract first so that the index won't be screwed up if they
+ # are siblings.
+ if isinstance(successor, PageElement):
+ successor.extract()
+ index = parent.index(self)
+ parent.insert(index+1+offset, successor)
+ offset += 1
+
+ def find_next(self, name=None, attrs={}, string=None, **kwargs):
+ """Find the first PageElement that matches the given criteria and
+ appears later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
+ return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
findNext = find_next # BS3
- def find_all_next(self, name=None, attrs={}, text=None, limit=None,
+ def find_all_next(self, name=None, attrs={}, string=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- after this Tag in the document."""
- return self._find_all(name, attrs, text, limit, self.next_elements,
- **kwargs)
+ """Find all PageElements that match the given criteria and appear
+ later in the document than this PageElement.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet containing PageElements.
+ """
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(name, attrs, string, limit, self.next_elements,
+ _stacklevel=_stacklevel+1, **kwargs)
findAllNext = find_all_next # BS3
- def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears after this Tag in the document."""
- return self._find_one(self.find_next_siblings, name, attrs, text,
+ def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
+ """Find the closest sibling to this PageElement that matches the
+ given criteria and appears later in the document.
+
+ All find_* methods take a common set of arguments. See the
+ online documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
+ return self._find_one(self.find_next_siblings, name, attrs, string,
**kwargs)
findNextSibling = find_next_sibling # BS3
- def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
+ def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
**kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear after this Tag in the document."""
- return self._find_all(name, attrs, text, limit,
- self.next_siblings, **kwargs)
+ """Find all siblings of this PageElement that match the given criteria
+ and appear later in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findNextSiblings = find_next_siblings # BS3
fetchNextSiblings = find_next_siblings # BS2
- def find_previous(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears before this Tag in the document."""
+ def find_previous(self, name=None, attrs={}, string=None, **kwargs):
+ """Look backwards in the document from this PageElement and find the
+ first PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self._find_one(
- self.find_all_previous, name, attrs, text, **kwargs)
+ self.find_all_previous, name, attrs, string, **kwargs)
findPrevious = find_previous # BS3
- def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
+ def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
**kwargs):
- """Returns all items that match the given criteria and appear
- before this Tag in the document."""
- return self._find_all(name, attrs, text, limit, self.previous_elements,
- **kwargs)
+ """Look backwards in the document from this PageElement and find all
+ PageElements that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit, self.previous_elements,
+ _stacklevel=_stacklevel+1, **kwargs
+ )
findAllPrevious = find_all_previous # BS3
fetchPrevious = find_all_previous # BS2
- def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears before this Tag in the document."""
- return self._find_one(self.find_previous_siblings, name, attrs, text,
+ def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
+ """Returns the closest sibling to this PageElement that matches the
+ given criteria and appears earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
+ return self._find_one(self.find_previous_siblings, name, attrs, string,
**kwargs)
findPreviousSibling = find_previous_sibling # BS3
- def find_previous_siblings(self, name=None, attrs={}, text=None,
+ def find_previous_siblings(self, name=None, attrs={}, string=None,
limit=None, **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear before this Tag in the document."""
- return self._find_all(name, attrs, text, limit,
- self.previous_siblings, **kwargs)
+ """Returns all siblings to this PageElement that match the
+ given criteria and appear earlier in the document.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(
+ name, attrs, string, limit,
+ self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
+ )
findPreviousSiblings = find_previous_siblings # BS3
fetchPreviousSiblings = find_previous_siblings # BS2
def find_parent(self, name=None, attrs={}, **kwargs):
- """Returns the closest parent of this Tag that matches the given
- criteria."""
+ """Find the closest parent of this PageElement that matches the given
+ criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
# NOTE: We can't use _find_one because findParents takes a different
# set of arguments.
r = None
- l = self.find_parents(name, attrs, 1, **kwargs)
+ l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
if l:
r = l[0]
return r
findParent = find_parent # BS3
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
- """Returns the parents of this Tag that match the given
- criteria."""
+ """Find all parents of this PageElement that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
+ _stacklevel = kwargs.pop('_stacklevel', 2)
return self._find_all(name, attrs, None, limit, self.parents,
- **kwargs)
+ _stacklevel=_stacklevel+1, **kwargs)
findParents = find_parents # BS3
fetchParents = find_parents # BS2
@property
def next(self):
+ """The PageElement, if any, that was parsed just after this one.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self.next_element
@property
def previous(self):
+ """The PageElement, if any, that was parsed just before this one.
+
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
return self.previous_element
#These methods do the real heavy lifting.
- def _find_one(self, method, name, attrs, text, **kwargs):
+ def _find_one(self, method, name, attrs, string, **kwargs):
r = None
- l = method(name, attrs, text, 1, **kwargs)
+ l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
if l:
r = l[0]
return r
- def _find_all(self, name, attrs, text, limit, generator, **kwargs):
+ def _find_all(self, name, attrs, string, limit, generator, **kwargs):
"Iterates over a generator looking for things that match."
+ _stacklevel = kwargs.pop('_stacklevel', 3)
- if text is None and 'string' in kwargs:
- text = kwargs['string']
- del kwargs['string']
+ if string is None and 'text' in kwargs:
+ string = kwargs.pop('text')
+ warnings.warn(
+ "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
+ DeprecationWarning, stacklevel=_stacklevel
+ )
if isinstance(name, SoupStrainer):
strainer = name
else:
- strainer = SoupStrainer(name, attrs, text, **kwargs)
+ strainer = SoupStrainer(name, attrs, string, **kwargs)
- if text is None and not limit and not attrs and not kwargs:
+ if string is None and not limit and not attrs and not kwargs:
if name is True or name is None:
# Optimization to find all tags.
result = (element for element in generator
@@ -522,9 +813,23 @@ class PageElement(object):
return ResultSet(strainer, result)
elif isinstance(name, str):
# Optimization to find all tags with a given name.
+ if name.count(':') == 1:
+ # This is a name with a prefix. If this is a namespace-aware document,
+ # we need to match the local name against tag.name. If not,
+ # we need to match the fully-qualified name against tag.name.
+ prefix, local_name = name.split(':', 1)
+ else:
+ prefix = None
+ local_name = name
result = (element for element in generator
if isinstance(element, Tag)
- and element.name == name)
+ and (
+ element.name == name
+ ) or (
+ element.name == local_name
+ and (prefix is None or element.prefix == prefix)
+ )
+ )
return ResultSet(strainer, result)
results = ResultSet(strainer)
while True:
@@ -544,6 +849,10 @@ class PageElement(object):
#NavigableStrings and Tags.
@property
def next_elements(self):
+ """All PageElements that were parsed after this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_element
while i is not None:
yield i
@@ -551,6 +860,11 @@ class PageElement(object):
@property
def next_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ later.
+
+ :yield: A sequence of PageElements.
+ """
i = self.next_sibling
while i is not None:
yield i
@@ -558,6 +872,10 @@ class PageElement(object):
@property
def previous_elements(self):
+ """All PageElements that were parsed before this one.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_element
while i is not None:
yield i
@@ -565,6 +883,11 @@ class PageElement(object):
@property
def previous_siblings(self):
+ """All PageElements that are siblings of this one but were parsed
+ earlier.
+
+ :yield: A sequence of PageElements.
+ """
i = self.previous_sibling
while i is not None:
yield i
@@ -572,87 +895,23 @@ class PageElement(object):
@property
def parents(self):
+ """All PageElements that are parents of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
i = self.parent
while i is not None:
yield i
i = i.parent
- # Methods for supporting CSS selectors.
-
- tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
-
- # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
- # \---------------------------/ \---/\-------------/ \-------/
- # | | | |
- # | | | The value
- # | | ~,|,^,$,* or =
- # | Attribute
- # Tag
- attribselect_re = re.compile(
- r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
- r'=?"?(?P<value>[^\]"]*)"?\]$'
- )
-
- def _attr_value_as_string(self, value, default=None):
- """Force an attribute value into a string representation.
+ @property
+ def decomposed(self):
+ """Check whether a PageElement has been decomposed.
- A multi-valued attribute will be converted into a
- space-separated stirng.
+ :rtype: bool
"""
- value = self.get(value, default)
- if isinstance(value, list) or isinstance(value, tuple):
- value =" ".join(value)
- return value
-
- def _tag_name_matches_and(self, function, tag_name):
- if not tag_name:
- return function
- else:
- def _match(tag):
- return tag.name == tag_name and function(tag)
- return _match
-
- def _attribute_checker(self, operator, attribute, value=''):
- """Create a function that performs a CSS selector operation.
-
- Takes an operator, attribute and optional value. Returns a
- function that will return True for elements that match that
- combination.
- """
- if operator == '=':
- # string representation of `attribute` is equal to `value`
- return lambda el: el._attr_value_as_string(attribute) == value
- elif operator == '~':
- # space-separated list representation of `attribute`
- # contains `value`
- def _includes_value(element):
- attribute_value = element.get(attribute, [])
- if not isinstance(attribute_value, list):
- attribute_value = attribute_value.split()
- return value in attribute_value
- return _includes_value
- elif operator == '^':
- # string representation of `attribute` starts with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').startswith(value)
- elif operator == '$':
- # string represenation of `attribute` ends with `value`
- return lambda el: el._attr_value_as_string(
- attribute, '').endswith(value)
- elif operator == '*':
- # string representation of `attribute` contains `value`
- return lambda el: value in el._attr_value_as_string(attribute, '')
- elif operator == '|':
- # string representation of `attribute` is either exactly
- # `value` or starts with `value` and then a dash.
- def _is_or_starts_with_dash(element):
- attribute_value = element._attr_value_as_string(attribute, '')
- return (attribute_value == value or attribute_value.startswith(
- value + '-'))
- return _is_or_starts_with_dash
- else:
- return lambda el: el.has_attr(attribute)
-
+ return getattr(self, '_decomposed', False) or False
+
# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
@@ -672,6 +931,11 @@ class PageElement(object):
class NavigableString(str, PageElement):
+ """A Python Unicode string that is part of a parse tree.
+
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a NavigableString for the string "penguin".
+ """
PREFIX = ''
SUFFIX = ''
@@ -691,12 +955,22 @@ class NavigableString(str, PageElement):
u.setup()
return u
- def __copy__(self):
+ def __deepcopy__(self, memo, recursive=False):
"""A copy of a NavigableString has the same contents and class
as the original, but it is not connected to the parse tree.
+
+ :param recursive: This parameter is ignored; it's only defined
+ so that NavigableString.__deepcopy__ implements the same
+ signature as Tag.__deepcopy__.
"""
return type(self)(self)
+ def __copy__(self):
+ """A copy of a NavigableString can only be a deep copy, because
+ only one PageElement can occupy a given place in a parse tree.
+ """
+ return self.__deepcopy__({})
+
def __getnewargs__(self):
return (str(self),)
@@ -712,55 +986,146 @@ class NavigableString(str, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
+ """Run the string through the provided formatter.
+
+ :param formatter: A Formatter object, or a string naming one of the standard formatters.
+ """
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@property
def name(self):
+ """Since a NavigableString is not a Tag, it has no .name.
+
+ This property is implemented so that code like this doesn't crash
+ when run on a mixture of Tag and NavigableString objects:
+ [x.name for x in tag.children]
+ """
return None
@name.setter
def name(self, name):
+ """Prevent NavigableString.name from ever being set."""
raise AttributeError("A NavigableString cannot be given a name.")
+ def _all_strings(self, strip=False, types=PageElement.default):
+ """Yield all strings of certain classes, possibly stripping them.
+
+ This makes it easy for NavigableString to implement methods
+ like get_text() as conveniences, creating a consistent
+ text-extraction API across all PageElements.
+
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. If this
+ NavigableString isn't one of those subclasses, the
+ sequence will be empty. By default, the subclasses
+ considered are NavigableString and CData objects. That
+ means no comments, processing instructions, etc.
+
+ :yield: A sequence that either contains this string, or is empty.
+
+ """
+ if types is self.default:
+ # This is kept in Tag because it's full of subclasses of
+ # this class, which aren't defined until later in the file.
+ types = Tag.DEFAULT_INTERESTING_STRING_TYPES
+
+ # Do nothing if the caller is looking for specific types of
+ # string, and we're of a different type.
+ #
+ # We check specific types instead of using isinstance(self,
+ # types) because all of these classes subclass
+ # NavigableString. Anyone who's using this feature probably
+ # wants generic NavigableStrings but not other stuff.
+ my_type = type(self)
+ if types is not None:
+ if isinstance(types, type):
+ # Looking for a single type.
+ if my_type is not types:
+ return
+ elif my_type not in types:
+ # Looking for one of a list of types.
+ return
+
+ value = self
+ if strip:
+ value = value.strip()
+ if len(value) > 0:
+ yield value
+ strings = property(_all_strings)
+
class PreformattedString(NavigableString):
"""A NavigableString not subject to the normal formatting rules.
- The string will be passed into the formatter (to trigger side effects),
- but the return value will be ignored.
+ This is an abstract class used for special kinds of strings such
+ as comments (the Comment class) and CDATA blocks (the CData
+ class).
"""
- def output_ready(self, formatter="minimal"):
- """CData strings are passed into the formatter.
- But the return value is ignored."""
- self.format_string(self, formatter)
+ PREFIX = ''
+ SUFFIX = ''
+
+ def output_ready(self, formatter=None):
+ """Make this string ready for output by adding any subclass-specific
+ prefix or suffix.
+
+ :param formatter: A Formatter object, or a string naming one
+ of the standard formatters. The string will be passed into the
+ Formatter, but only to trigger any side effects: the return
+ value is ignored.
+
+ :return: The string, with any subclass-specific prefix and
+ suffix added on.
+ """
+ if formatter is not None:
+ ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
-
+ """A CDATA block."""
PREFIX = '<![CDATA['
SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
+ """An SGML processing instruction."""
PREFIX = '<?'
SUFFIX = '>'
-class Comment(PreformattedString):
+class XMLProcessingInstruction(ProcessingInstruction):
+ """An XML processing instruction."""
+ PREFIX = '<?'
+ SUFFIX = '?>'
+class Comment(PreformattedString):
+ """An HTML or XML comment."""
PREFIX = '<!--'
SUFFIX = '-->'
class Declaration(PreformattedString):
+ """An XML declaration."""
PREFIX = '<?'
SUFFIX = '?>'
class Doctype(PreformattedString):
-
+ """A document type declaration."""
@classmethod
def for_name_and_ids(cls, name, pub_id, system_id):
+ """Generate an appropriate document type declaration for a given
+ public ID and system ID.
+
+ :param name: The name of the document's root element, e.g. 'html'.
+ :param pub_id: The Formal Public Identifier for this document type,
+ e.g. '-//W3C//DTD XHTML 1.1//EN'
+ :param system_id: The system identifier for this document type,
+ e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'
+
+ :return: A Doctype.
+ """
value = name or ''
if pub_id is not None:
value += ' PUBLIC "%s"' % pub_id
@@ -775,14 +1140,105 @@ class Doctype(PreformattedString):
SUFFIX = '>\n'
+class Stylesheet(NavigableString):
+ """A NavigableString representing a stylesheet (probably
+ CSS).
+
+ Used to distinguish embedded stylesheets from textual content.
+ """
+ pass
+
+
+class Script(NavigableString):
+ """A NavigableString representing an executable script (probably
+ JavaScript).
+
+ Used to distinguish executable code from textual content.
+ """
+ pass
+
+
+class TemplateString(NavigableString):
+ """A NavigableString representing a string found inside an HTML
+ template embedded in a larger document.
+
+ Used to distinguish such strings from the main body of the document.
+ """
+ pass
+
+
+class RubyTextString(NavigableString):
+ """A NavigableString representing the contents of the <rt> HTML
+ element.
+
+ https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element
+
+ Can be used to distinguish such strings from the strings they're
+ annotating.
+ """
+ pass
+
+
+class RubyParenthesisString(NavigableString):
+ """A NavigableString representing the contents of the <rp> HTML
+ element.
+
+ https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
+ """
+ pass
+
+
class Tag(PageElement):
+ """Represents an HTML or XML tag that is part of a parse tree, along
+ with its attributes and contents.
- """Represents a found HTML tag with its attributes and contents."""
+ When Beautiful Soup parses the markup <b>penguin</b>, it will
+ create a Tag object representing the <b> tag.
+ """
def __init__(self, parser=None, builder=None, name=None, namespace=None,
- prefix=None, attrs=None, parent=None, previous=None):
- "Basic constructor."
-
+ prefix=None, attrs=None, parent=None, previous=None,
+ is_xml=None, sourceline=None, sourcepos=None,
+ can_be_empty_element=None, cdata_list_attributes=None,
+ preserve_whitespace_tags=None,
+ interesting_string_types=None,
+ namespaces=None
+ ):
+ """Basic constructor.
+
+ :param parser: A BeautifulSoup object.
+ :param builder: A TreeBuilder.
+ :param name: The name of the tag.
+ :param namespace: The URI of this Tag's XML namespace, if any.
+ :param prefix: The prefix for this Tag's XML namespace, if any.
+ :param attrs: A dictionary of this Tag's attribute values.
+ :param parent: The PageElement to use as this Tag's parent.
+ :param previous: The PageElement that was parsed immediately before
+ this tag.
+ :param is_xml: If True, this is an XML tag. Otherwise, this is an
+ HTML tag.
+ :param sourceline: The line number where this tag was found in its
+ source document.
+ :param sourcepos: The character position within `sourceline` where this
+ tag was found.
+ :param can_be_empty_element: If True, this tag should be
+ represented as <tag/>. If False, this tag should be represented
+ as <tag></tag>.
+ :param cdata_list_attributes: A list of attributes whose values should
+ be treated as CDATA if they ever show up on this tag.
+ :param preserve_whitespace_tags: A list of tag names whose contents
+ should have their whitespace preserved.
+ :param interesting_string_types: This is a NavigableString
+ subclass or a tuple of them. When iterating over this
+ Tag's strings in methods like Tag.strings or Tag.get_text,
+ these are the types of strings that are interesting enough
+ to be considered. The default is to consider
+ NavigableString and CData the only interesting string
+ subtypes.
+ :param namespaces: A dictionary mapping currently active
+ namespace prefixes to URIs. This can be used later to
+ construct CSS selectors.
+ """
if parser is None:
self.parser_class = None
else:
@@ -793,7 +1249,12 @@ class Tag(PageElement):
raise ValueError("No value provided for new tag's name.")
self.name = name
self.namespace = namespace
+ self._namespaces = namespaces or {}
self.prefix = prefix
+ if ((not builder or builder.store_line_numbers)
+ and (sourceline is not None or sourcepos is not None)):
+ self.sourceline = sourceline
+ self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
@@ -804,32 +1265,109 @@ class Tag(PageElement):
attrs = dict(attrs)
else:
attrs = dict(attrs)
+
+ # If possible, determine ahead of time whether this tag is an
+ # XML tag.
+ if builder:
+ self.known_xml = builder.is_xml
+ else:
+ self.known_xml = is_xml
self.attrs = attrs
self.contents = []
self.setup(parent, previous)
self.hidden = False
- # Set up any substitutions, such as the charset in a META tag.
- if builder is not None:
+ if builder is None:
+ # In the absence of a TreeBuilder, use whatever values were
+ # passed in here. They're probably None, unless this is a copy of some
+ # other tag.
+ self.can_be_empty_element = can_be_empty_element
+ self.cdata_list_attributes = cdata_list_attributes
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+ self.interesting_string_types = interesting_string_types
+ else:
+ # Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
+
+ # Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name)
- else:
- self.can_be_empty_element = False
+
+ # Keep track of the list of attributes of this tag that
+ # might need to be treated as a list.
+ #
+ # For performance reasons, we store the whole data structure
+ # rather than asking the question of every tag. Asking would
+ # require building a new data structure every time, and
+ # (unlike can_be_empty_element), we almost never need
+ # to check this.
+ self.cdata_list_attributes = builder.cdata_list_attributes
+
+ # Keep track of the names that might cause this tag to be treated as a
+ # whitespace-preserved tag.
+ self.preserve_whitespace_tags = builder.preserve_whitespace_tags
+
+ if self.name in builder.string_containers:
+ # This sort of tag uses a special string container subclass
+ # for most of its strings, so treat that subclass as interesting.
+ self.interesting_string_types = builder.string_containers[self.name]
+ else:
+ self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
parserClass = _alias("parser_class") # BS3
- def __copy__(self):
- """A copy of a Tag is a new Tag, unconnected to the parse tree.
+ def __deepcopy__(self, memo, recursive=True):
+ """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
- clone = type(self)(None, self.builder, self.name, self.namespace,
- self.nsprefix, self.attrs)
+ clone = self._clone()
+
+ if recursive:
+ # Clone this tag's descendants recursively, but without
+ # making any recursive function calls.
+ tag_stack = [clone]
+ for event, element in self._event_stream(self.descendants):
+ if event is Tag.END_ELEMENT_EVENT:
+ # Stop appending incoming Tags to the Tag that was
+ # just closed.
+ tag_stack.pop()
+ else:
+ descendant_clone = element.__deepcopy__(
+ memo, recursive=False
+ )
+ # Add to its parent's .contents
+ tag_stack[-1].append(descendant_clone)
+
+ if event is Tag.START_ELEMENT_EVENT:
+ # Add the Tag itself to the stack so that its
+ # children will be .appended to it.
+ tag_stack.append(descendant_clone)
+ return clone
+
+ def __copy__(self):
+ """A copy of a Tag must always be a deep copy, because a Tag's
+ children can only have one parent at a time.
+ """
+ return self.__deepcopy__({})
+
+ def _clone(self):
+ """Create a new Tag just like this one, but with no
+ contents and unattached to any parse tree.
+
+ This is the first step in the deepcopy process.
+ """
+ clone = type(self)(
+ None, None, self.name, self.namespace,
+ self.prefix, self.attrs, is_xml=self._is_xml,
+ sourceline=self.sourceline, sourcepos=self.sourcepos,
+ can_be_empty_element=self.can_be_empty_element,
+ cdata_list_attributes=self.cdata_list_attributes,
+ preserve_whitespace_tags=self.preserve_whitespace_tags,
+ interesting_string_types=self.interesting_string_types
+ )
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
- for child in self.contents:
- clone.append(child.__copy__())
return clone
-
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)
@@ -850,13 +1388,17 @@ class Tag(PageElement):
@property
def string(self):
- """Convenience property to get the single string within this tag.
+ """Convenience property to get the single string within this
+ PageElement.
- :Return: If this tag has a single string child, return value
- is that string. If this tag has no children, or more than one
- child, return value is None. If this tag has one child tag,
+ TODO It might make sense to have NavigableString.string return
+ itself.
+
+ :return: If this element has a single string child, return
+ value is that string. If this element has one child tag,
return value is the 'string' attribute of the child tag,
- recursively.
+ recursively. If this element is itself a string, has no
+ children, or has more than one child, return value is None.
"""
if len(self.contents) != 1:
return None
@@ -867,57 +1409,75 @@ class Tag(PageElement):
@string.setter
def string(self, string):
+ """Replace this PageElement's contents with `string`."""
self.clear()
self.append(string.__class__(string))
- def _all_strings(self, strip=False, types=(NavigableString, CData)):
+ DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
+ def _all_strings(self, strip=False, types=PageElement.default):
"""Yield all strings of certain classes, possibly stripping them.
- By default, yields only NavigableString and CData objects. So
- no comments, processing instructions, etc.
+ :param strip: If True, all strings will be stripped before being
+ yielded.
+
+ :param types: A tuple of NavigableString subclasses. Any strings of
+ a subclass not found in this list will be ignored. By
+ default, the subclasses considered are the ones found in
+ self.interesting_string_types. If that's not specified,
+ only NavigableString and CData objects will be
+ considered. That means no comments, processing
+ instructions, etc.
+
+ :yield: A sequence of strings.
+
"""
+ if types is self.default:
+ types = self.interesting_string_types
+
for descendant in self.descendants:
- if (
- (types is None and not isinstance(descendant, NavigableString))
- or
- (types is not None and type(descendant) not in types)):
+ if (types is None and not isinstance(descendant, NavigableString)):
+ continue
+ descendant_type = type(descendant)
+ if isinstance(types, type):
+ if descendant_type is not types:
+ # We're not interested in strings of this type.
+ continue
+ elif types is not None and descendant_type not in types:
+ # We're not interested in strings of this type.
continue
if strip:
descendant = descendant.strip()
if len(descendant) == 0:
continue
yield descendant
-
strings = property(_all_strings)
- @property
- def stripped_strings(self):
- for string in self._all_strings(True):
- yield string
+ def decompose(self):
+ """Recursively destroys this PageElement and its children.
- def get_text(self, separator="", strip=False,
- types=(NavigableString, CData)):
- """
- Get all child strings, concatenated using the given separator.
- """
- return separator.join([s for s in self._all_strings(
- strip, types=types)])
- getText = get_text
- text = property(get_text)
+ This element will be removed from the tree and wiped out; so
+ will everything beneath it.
- def decompose(self):
- """Recursively destroys the contents of this tree."""
+ The behavior of a decomposed PageElement is undefined and you
+ should never use one for anything, but if you need to _check_
+ whether an element has been decomposed, you can use the
+ `decomposed` property.
+ """
self.extract()
i = self
while i is not None:
- next = i.next_element
+ n = i.next_element
i.__dict__.clear()
i.contents = []
- i = next
+ i._decomposed = True
+ i = n
def clear(self, decompose=False):
- """
- Extract all children. If decompose is True, decompose instead.
+ """Wipe out all children of this PageElement by calling extract()
+ on them.
+
+ :param decompose: If this is True, decompose() (a more
+ destructive method) will be called instead of extract().
"""
if decompose:
for element in self.contents[:]:
@@ -929,10 +1489,51 @@ class Tag(PageElement):
for element in self.contents[:]:
element.extract()
- def index(self, element):
+ def smooth(self):
+ """Smooth out this element's children by consolidating consecutive
+ strings.
+
+ This makes pretty-printed output look more natural following a
+ lot of operations that modified the tree.
"""
- Find the index of a child by identity, not value. Avoids issues with
- tag.contents.index(element) getting the index of equal elements.
+ # Mark the first position of every pair of children that need
+ # to be consolidated. Do this rather than making a copy of
+ # self.contents, since in most cases very few strings will be
+ # affected.
+ marked = []
+ for i, a in enumerate(self.contents):
+ if isinstance(a, Tag):
+ # Recursively smooth children.
+ a.smooth()
+ if i == len(self.contents)-1:
+ # This is the last item in .contents, so there is no next
+ # sibling to merge it with. There's no chance it needs any work.
+ continue
+ b = self.contents[i+1]
+ if (isinstance(a, NavigableString)
+ and isinstance(b, NavigableString)
+ and not isinstance(a, PreformattedString)
+ and not isinstance(b, PreformattedString)
+ ):
+ marked.append(i)
+
+ # Go over the marked positions in reverse order, so that
+ # removing items from .contents won't affect the remaining
+ # positions.
+ for i in reversed(marked):
+ a = self.contents[i]
+ b = self.contents[i+1]
+ b.extract()
+ n = NavigableString(a+b)
+ a.replace_with(n)
+
+ def index(self, element):
+ """Find the index of a child by identity, not value.
+
+ Avoids issues with tag.contents.index(element) getting the
+ index of equal elements.
+
+ :param element: Look for this PageElement in `self.contents`.
"""
for i, child in enumerate(self.contents):
if child is element:
@@ -945,23 +1546,38 @@ class Tag(PageElement):
attribute."""
return self.attrs.get(key, default)
+ def get_attribute_list(self, key, default=None):
+ """The same as get(), but always returns a list.
+
+ :param key: The attribute to look for.
+ :param default: Use this value if the attribute is not present
+ on this PageElement.
+ :return: A list of values, probably containing only a single
+ value.
+ """
+ value = self.get(key, default)
+ if not isinstance(value, list):
+ value = [value]
+ return value
+
def has_attr(self, key):
+ """Does this PageElement have an attribute with the given name?"""
return key in self.attrs
def __hash__(self):
return str(self).__hash__()
def __getitem__(self, key):
- """tag[key] returns the value of the 'key' attribute for the tag,
+ """tag[key] returns the value of the 'key' attribute for the Tag,
and throws an exception if it's not there."""
return self.attrs[key]
def __iter__(self):
- "Iterating over a tag iterates over its contents."
+ "Iterating over a Tag iterates over its contents."
return iter(self.contents)
def __len__(self):
- "The length of a tag is the length of its list of contents."
+ "The length of a Tag is the length of its list of contents."
return len(self.contents)
def __contains__(self, x):
@@ -981,29 +1597,33 @@ class Tag(PageElement):
self.attrs.pop(key, None)
def __call__(self, *args, **kwargs):
- """Calling a tag like a function is the same as calling its
+ """Calling a Tag like a function is the same as calling its
find_all() method. Eg. tag('a') returns a list of all the A tags
found within this tag."""
return self.find_all(*args, **kwargs)
def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
+ """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
+ #print("Getattr %s.%s" % (self.__class__, tag))
if len(tag) > 3 and tag.endswith('Tag'):
# BS3: soup.aTag -> "soup.find("a")
tag_name = tag[:-3]
warnings.warn(
- '.%sTag is deprecated, use .find("%s") instead.' % (
- tag_name, tag_name))
+ '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
+ name=tag_name
+ ),
+ DeprecationWarning, stacklevel=2
+ )
return self.find(tag_name)
# We special case contents to avoid recursion.
- elif not tag.startswith("__") and not tag=="contents":
+ elif not tag.startswith("__") and not tag == "contents":
return self.find(tag)
raise AttributeError(
"'%s' object has no attribute '%s'" % (self.__class__, tag))
def __eq__(self, other):
- """Returns true iff this tag has the same name, the same attributes,
- and the same contents (recursively) as the given tag."""
+ """Returns true iff this Tag has the same name, the same attributes,
+ and the same contents (recursively) as `other`."""
if self is other:
return True
if (not hasattr(other, 'name') or
@@ -1019,69 +1639,235 @@ class Tag(PageElement):
return True
def __ne__(self, other):
- """Returns true iff this tag is not identical to the other tag,
+ """Returns true iff this Tag is not identical to `other`,
as defined in __eq__."""
return not self == other
def __repr__(self, encoding="unicode-escape"):
- """Renders this tag as a string."""
- if PY3K:
- # "The return value must be a string object", i.e. Unicode
- return self.decode()
- else:
- # "The return value must be a string object", i.e. a bytestring.
- # By convention, the return value of __repr__ should also be
- # an ASCII string.
- return self.encode(encoding)
+ """Renders this PageElement as a string.
- def __unicode__(self):
+ :param encoding: The encoding to use (Python 2 only).
+ TODO: This is now ignored and a warning should be issued
+ if a value is provided.
+ :return: A (Unicode) string.
+ """
+ # "The return value must be a string object", i.e. Unicode
return self.decode()
- def __str__(self):
- if PY3K:
- return self.decode()
- else:
- return self.encode()
+ def __unicode__(self):
+ """Renders this PageElement as a Unicode string."""
+ return self.decode()
- if PY3K:
- __str__ = __repr__ = __unicode__
+ __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
indent_level=None, formatter="minimal",
errors="xmlcharrefreplace"):
+ """Render a bytestring representation of this PageElement and its
+ contents.
+
+ :param encoding: The destination encoding.
+ :param indent_level: Each line of the rendering will be
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :param errors: An error handling strategy such as
+ 'xmlcharrefreplace'. This value is passed along into
+ encode() and its value should be one of the constants
+ defined by Python.
+ :return: A bytestring.
+
+ """
# Turn the data structure into Unicode, then encode the
# Unicode.
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
- def _should_pretty_print(self, indent_level):
- """Should this tag be pretty-printed?"""
- return (
- indent_level is not None and
- (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
- or self._is_xml))
-
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
- """Returns a Unicode representation of this tag and its contents.
+ formatter="minimal",
+ iterator=None):
+ pieces = []
+ # First off, turn a non-Formatter `formatter` into a Formatter
+ # object. This will stop the lookup from happening over and
+ # over again.
+ if not isinstance(formatter, Formatter):
+ formatter = self.formatter_for_name(formatter)
+
+ if indent_level is True:
+ indent_level = 0
+
+ # The currently active tag that put us into string literal
+ # mode. Until this element is closed, children will be treated
+ # as string literals and not pretty-printed. String literal
+ # mode is turned on immediately after this tag begins, and
+ # turned off immediately before it's closed. This means there
+ # will be whitespace before and after the tag itself.
+ string_literal_tag = None
+
+ for event, element in self._event_stream(iterator):
+ if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=True
+ )
+ elif event is Tag.END_ELEMENT_EVENT:
+ piece = element._format_tag(
+ eventual_encoding, formatter, opening=False
+ )
+ if indent_level is not None:
+ indent_level -= 1
+ else:
+ piece = element.output_ready(formatter)
+
+ # Now we need to apply the 'prettiness' -- extra
+ # whitespace before and/or after this tag. This can get
+ # complicated because certain tags, like <pre> and
+ # <script>, can't be prettified, since adding whitespace would
+ # change the meaning of the content.
+
+ # The default behavior is to add whitespace before and
+ # after an element when string literal mode is off, and to
+ # leave things as they are when string literal mode is on.
+ if string_literal_tag:
+ indent_before = indent_after = False
+ else:
+ indent_before = indent_after = True
+
+ # The only time the behavior is more complex than that is
+ # when we encounter an opening or closing tag that might
+ # put us into or out of string literal mode.
+ if (event is Tag.START_ELEMENT_EVENT
+ and not string_literal_tag
+ and not element._should_pretty_print()):
+ # We are about to enter string literal mode. Add
+ # whitespace before this tag, but not after. We
+ # will stay in string literal mode until this tag
+ # is closed.
+ indent_before = True
+ indent_after = False
+ string_literal_tag = element
+ elif (event is Tag.END_ELEMENT_EVENT
+ and element is string_literal_tag):
+ # We are about to exit string literal mode by closing
+ # the tag that sent us into that mode. Add whitespace
+ # after this tag, but not before.
+ indent_before = False
+ indent_after = True
+ string_literal_tag = None
+
+ # Now we know whether to add whitespace before and/or
+ # after this element.
+ if indent_level is not None:
+ if (indent_before or indent_after):
+ if isinstance(element, NavigableString):
+ piece = piece.strip()
+ if piece:
+ piece = self._indent_string(
+ piece, indent_level, formatter,
+ indent_before, indent_after
+ )
+ if event == Tag.START_ELEMENT_EVENT:
+ indent_level += 1
+ pieces.append(piece)
+ return "".join(pieces)
+
+ # Names for the different events yielded by _event_stream
+ START_ELEMENT_EVENT = object()
+ END_ELEMENT_EVENT = object()
+ EMPTY_ELEMENT_EVENT = object()
+ STRING_ELEMENT_EVENT = object()
+
+ def _event_stream(self, iterator=None):
+ """Yield a sequence of events that can be used to reconstruct the DOM
+ for this element.
+
+ This lets us recreate the nested structure of this element
+ (e.g. when formatting it as a string) without using recursive
+ method calls.
+
+ This is similar in concept to the SAX API, but it's a simpler
+ interface designed for internal use. The events are different
+ from SAX and the arguments associated with the events are Tags
+ and other Beautiful Soup objects.
+
+ :param iterator: An alternate iterator to use when traversing
+ the tree.
+ """
+ tag_stack = []
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a <META> tag that mentions the document's
- encoding.
+ iterator = iterator or self.self_and_descendants
+
+ for c in iterator:
+ # If the parent of the element we're about to yield is not
+ # the tag currently on the stack, it means that the tag on
+ # the stack closed before this element appeared.
+ while tag_stack and c.parent != tag_stack[-1]:
+ now_closed_tag = tag_stack.pop()
+ yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+ if isinstance(c, Tag):
+ if c.is_empty_element:
+ yield Tag.EMPTY_ELEMENT_EVENT, c
+ else:
+ yield Tag.START_ELEMENT_EVENT, c
+ tag_stack.append(c)
+ continue
+ else:
+ yield Tag.STRING_ELEMENT_EVENT, c
+
+ while tag_stack:
+ now_closed_tag = tag_stack.pop()
+ yield Tag.END_ELEMENT_EVENT, now_closed_tag
+
+ def _indent_string(self, s, indent_level, formatter,
+ indent_before, indent_after):
+ """Add indentation whitespace before and/or after a string.
+
+ :param s: The string to amend with whitespace.
+ :param indent_level: The indentation level; affects how much
+ whitespace goes before the string.
+ :param indent_before: Whether or not to add whitespace
+ before the string.
+ :param indent_after: Whether or not to add whitespace
+ (a newline) after the string.
"""
+ space_before = ''
+ if indent_before and indent_level:
+ space_before = (formatter.indent * indent_level)
- # First off, turn a string formatter into a function. This
- # will stop the lookup from happening over and over again.
- if not isinstance(formatter, collections.abc.Callable):
- formatter = self._formatter_for_name(formatter)
+ space_after = ''
+ if indent_after:
+ space_after = "\n"
- attrs = []
- if self.attrs:
- for key, val in sorted(self.attrs.items()):
+ return space_before + s + space_after
+
+ def _format_tag(self, eventual_encoding, formatter, opening):
+ if self.hidden:
+ # A hidden tag is invisible, although its contents
+ # are visible.
+ return ''
+
+ # A tag starts with the < character (see below).
+
+ # Then the / character, if this is a closing tag.
+ closing_slash = ''
+ if not opening:
+ closing_slash = '/'
+
+ # Then an optional namespace prefix.
+ prefix = ''
+ if self.prefix:
+ prefix = self.prefix + ":"
+
+ # Then a list of attribute values, if this is an opening tag.
+ attribute_string = ''
+ if opening:
+ attributes = formatter.attributes(self)
+ attrs = []
+ for key, val in attributes:
if val is None:
decoded = key
else:
@@ -1090,71 +1876,52 @@ class Tag(PageElement):
elif not isinstance(val, str):
val = str(val)
elif (
- isinstance(val, AttributeValueWithCharsetSubstitution)
- and eventual_encoding is not None):
+ isinstance(val, AttributeValueWithCharsetSubstitution)
+ and eventual_encoding is not None
+ ):
val = val.encode(eventual_encoding)
- text = self.format_string(val, formatter)
+ text = formatter.attribute_value(val)
decoded = (
str(key) + '='
- + EntitySubstitution.quoted_attribute_value(text))
+ + formatter.quoted_attribute_value(text))
attrs.append(decoded)
- close = ''
- closeTag = ''
-
- prefix = ''
- if self.prefix:
- prefix = self.prefix + ":"
+ if attrs:
+ attribute_string = ' ' + ' '.join(attrs)
+ # Then an optional closing slash (for a void element in an
+ # XML document).
+ void_element_closing_slash = ''
if self.is_empty_element:
- close = '/'
- else:
- closeTag = '</%s%s>' % (prefix, self.name)
-
- pretty_print = self._should_pretty_print(indent_level)
- space = ''
- indent_space = ''
- if indent_level is not None:
- indent_space = (' ' * (indent_level - 1))
- if pretty_print:
- space = indent_space
- indent_contents = indent_level + 1
- else:
- indent_contents = None
- contents = self.decode_contents(
- indent_contents, eventual_encoding, formatter)
+ void_element_closing_slash = formatter.void_element_close_prefix or ''
- if self.hidden:
- # This is the 'document root' object.
- s = contents
- else:
- s = []
- attribute_string = ''
- if attrs:
- attribute_string = ' ' + ' '.join(attrs)
- if indent_level is not None:
- # Even if this particular tag is not pretty-printed,
- # we should indent up to the start of the tag.
- s.append(indent_space)
- s.append('<%s%s%s%s>' % (
- prefix, self.name, attribute_string, close))
- if pretty_print:
- s.append("\n")
- s.append(contents)
- if pretty_print and contents and contents[-1] != "\n":
- s.append("\n")
- if pretty_print and closeTag:
- s.append(space)
- s.append(closeTag)
- if indent_level is not None and closeTag and self.next_sibling:
- # Even if this particular tag is not pretty-printed,
- # we're now done with the tag, and we should add a
- # newline if appropriate.
- s.append("\n")
- s = ''.join(s)
- return s
+ # Put it all together.
+ return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
+
+ def _should_pretty_print(self, indent_level=1):
+ """Should this tag be pretty-printed?
+
+ Most of them should, but some (such as <pre> in HTML
+ documents) should not.
+ """
+ return (
+ indent_level is not None
+ and (
+ not self.preserve_whitespace_tags
+ or self.name not in self.preserve_whitespace_tags
+ )
+ )
def prettify(self, encoding=None, formatter="minimal"):
+ """Pretty-print this PageElement as a string.
+
+ :param encoding: The eventual encoding of the string. If this is None,
+ a Unicode string will be returned.
+ :param formatter: A Formatter object, or a string naming one of
+ the standard formatters.
+ :return: A Unicode string (if encoding==None) or a bytestring
+ (otherwise).
+ """
if encoding is None:
return self.decode(True, formatter=formatter)
else:
@@ -1166,62 +1933,50 @@ class Tag(PageElement):
"""Renders the contents of this tag as a Unicode string.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
:param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
+ encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
- """
- # First off, turn a string formatter into a function. This
- # will stop the lookup from happening over and over again.
- if not isinstance(formatter, collections.abc.Callable):
- formatter = self._formatter_for_name(formatter)
-
- pretty_print = (indent_level is not None)
- s = []
- for c in self:
- text = None
- if isinstance(c, NavigableString):
- text = c.output_ready(formatter)
- elif isinstance(c, Tag):
- s.append(c.decode(indent_level, eventual_encoding,
- formatter))
- if text and indent_level and not self.name == 'pre':
- text = text.strip()
- if text:
- if pretty_print and not self.name == 'pre':
- s.append(" " * (indent_level - 1))
- s.append(text)
- if pretty_print and not self.name == 'pre':
- s.append("\n")
- return ''.join(s)
+ :param formatter: A Formatter object, or a string naming one of
+ the standard Formatters.
+
+ """
+ return self.decode(indent_level, eventual_encoding, formatter,
+ iterator=self.descendants)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
- """Renders the contents of this tag as a bytestring.
+ """Renders the contents of this PageElement as a bytestring.
:param indent_level: Each line of the rendering will be
- indented this many spaces.
+ indented this many levels. (The formatter decides what a
+ 'level' means in terms of spaces or other characters
+ output.) Used internally in recursive calls while
+ pretty-printing.
:param eventual_encoding: The bytestring will be in this encoding.
- :param formatter: The output formatter responsible for converting
- entities to Unicode characters.
- """
+ :param formatter: A Formatter object, or a string naming one of
+ the standard Formatters.
+ :return: A bytestring.
+ """
contents = self.decode_contents(indent_level, encoding, formatter)
return contents.encode(encoding)
# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
prettyPrint=False, indentLevel=0):
+ """Deprecated method for BS3 compatibility."""
if not prettyPrint:
indentLevel = None
return self.encode_contents(
@@ -1229,44 +1984,88 @@ class Tag(PageElement):
#Soup methods
- def find(self, name=None, attrs={}, recursive=True, text=None,
+ def find(self, name=None, attrs={}, recursive=True, string=None,
**kwargs):
- """Return only the first child of this Tag matching the given
- criteria."""
+ """Look in the children of this PageElement and find the first
+ PageElement that matches the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A PageElement.
+ :rtype: bs4.element.Tag | bs4.element.NavigableString
+ """
r = None
- l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
+ l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
+ **kwargs)
if l:
r = l[0]
return r
- findChild = find
+ findChild = find #BS2
- def find_all(self, name=None, attrs={}, recursive=True, text=None,
+ def find_all(self, name=None, attrs={}, recursive=True, string=None,
limit=None, **kwargs):
- """Extracts a list of Tag objects that match the given
- criteria. You can specify the name of the Tag and any
- attributes you want the Tag to have.
-
- The value of a key-value pair in the 'attrs' map can be a
- string, a list of strings, a regular expression object, or a
- callable that takes a string and returns whether or not the
- string matches for some custom definition of 'matches'. The
- same is true of the tag name."""
-
+ """Look in the children of this PageElement and find all
+ PageElements that match the given criteria.
+
+ All find_* methods take a common set of arguments. See the online
+ documentation for detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param recursive: If this is True, find_all() will perform a
+ recursive search of this PageElement's children. Otherwise,
+ only the direct children will be considered.
+ :param limit: Stop looking after finding this many results.
+ :kwargs: A dictionary of filters on attribute values.
+ :return: A ResultSet of PageElements.
+ :rtype: bs4.element.ResultSet
+ """
generator = self.descendants
if not recursive:
generator = self.children
- return self._find_all(name, attrs, text, limit, generator, **kwargs)
+ _stacklevel = kwargs.pop('_stacklevel', 2)
+ return self._find_all(name, attrs, string, limit, generator,
+ _stacklevel=_stacklevel+1, **kwargs)
findAll = find_all # BS3
findChildren = find_all # BS2
#Generator methods
@property
def children(self):
+ """Iterate over all direct children of this PageElement.
+
+ :yield: A sequence of PageElements.
+ """
# return iter() to make the purpose of the method clear
return iter(self.contents) # XXX This seems to be untested.
@property
+ def self_and_descendants(self):
+ """Iterate over this PageElement and its children in a
+ depth-first sequence.
+
+ :yield: A sequence of PageElements.
+ """
+ if not self.hidden:
+ yield self
+ for i in self.descendants:
+ yield i
+
+ @property
def descendants(self):
+ """Iterate over all children of this PageElement in a
+ depth-first sequence.
+
+ :yield: A sequence of PageElements.
+ """
if not len(self.contents):
return
stopNode = self._last_descendant().next_element
@@ -1276,262 +2075,102 @@ class Tag(PageElement):
current = current.next_element
# CSS selector code
+ def select_one(self, selector, namespaces=None, **kwargs):
+ """Perform a CSS selection operation on the current element.
- _selector_combinators = ['>', '+', '~']
- _select_debug = False
- def select_one(self, selector):
- """Perform a CSS selection operation on the current element."""
- value = self.select(selector, limit=1)
- if value:
- return value[0]
- return None
+ :param selector: A CSS selector.
- def select(self, selector, _candidate_generator=None, limit=None):
- """Perform a CSS selection operation on the current element."""
-
- # Handle grouping selectors if ',' exists, ie: p,a
- if ',' in selector:
- context = []
- for partial_selector in selector.split(','):
- partial_selector = partial_selector.strip()
- if partial_selector == '':
- raise ValueError('Invalid group selection syntax: %s' % selector)
- candidates = self.select(partial_selector, limit=limit)
- for candidate in candidates:
- if candidate not in context:
- context.append(candidate)
-
- if limit and len(context) >= limit:
- break
- return context
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
- tokens = selector.split()
- current_context = [self]
+ :param kwargs: Keyword arguments to be passed into Soup Sieve's
+ soupsieve.select() method.
- if tokens[-1] in self._selector_combinators:
- raise ValueError(
- 'Final combinator "%s" is missing an argument.' % tokens[-1])
+ :return: A Tag.
+ :rtype: bs4.element.Tag
+ """
+ return self.css.select_one(selector, namespaces, **kwargs)
- if self._select_debug:
- print('Running CSS selector "%s"' % selector)
+ def select(self, selector, namespaces=None, limit=None, **kwargs):
+ """Perform a CSS selection operation on the current element.
- for index, token in enumerate(tokens):
- new_context = []
- new_context_ids = set([])
+ This uses the SoupSieve library.
- if tokens[index-1] in self._selector_combinators:
- # This token was consumed by the previous combinator. Skip it.
- if self._select_debug:
- print(' Token was consumed by the previous combinator.')
- continue
+ :param selector: A string containing a CSS selector.
- if self._select_debug:
- print(' Considering token "%s"' % token)
- recursive_candidate_generator = None
- tag_name = None
-
- # Each operation corresponds to a checker function, a rule
- # for determining whether a candidate matches the
- # selector. Candidates are generated by the active
- # iterator.
- checker = None
-
- m = self.attribselect_re.match(token)
- if m is not None:
- # Attribute selector
- tag_name, attribute, operator, value = m.groups()
- checker = self._attribute_checker(operator, attribute, value)
-
- elif '#' in token:
- # ID selector
- tag_name, tag_id = token.split('#', 1)
- def id_matches(tag):
- return tag.get('id', None) == tag_id
- checker = id_matches
-
- elif '.' in token:
- # Class selector
- tag_name, klass = token.split('.', 1)
- classes = set(klass.split('.'))
- def classes_match(candidate):
- return classes.issubset(candidate.get('class', []))
- checker = classes_match
-
- elif ':' in token:
- # Pseudo-class
- tag_name, pseudo = token.split(':', 1)
- if tag_name == '':
- raise ValueError(
- "A pseudo-class must be prefixed with a tag name.")
- pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
- found = []
- if pseudo_attributes is None:
- pseudo_type = pseudo
- pseudo_value = None
- else:
- pseudo_type, pseudo_value = pseudo_attributes.groups()
- if pseudo_type == 'nth-of-type':
- try:
- pseudo_value = int(pseudo_value)
- except:
- raise NotImplementedError(
- 'Only numeric values are currently supported for the nth-of-type pseudo-class.')
- if pseudo_value < 1:
- raise ValueError(
- 'nth-of-type pseudo-class value must be at least 1.')
- class Counter(object):
- def __init__(self, destination):
- self.count = 0
- self.destination = destination
-
- def nth_child_of_type(self, tag):
- self.count += 1
- if self.count == self.destination:
- return True
- if self.count > self.destination:
- # Stop the generator that's sending us
- # these things.
- raise StopIteration()
- return False
- checker = Counter(pseudo_value).nth_child_of_type
- else:
- raise NotImplementedError(
- 'Only the following pseudo-classes are implemented: nth-of-type.')
-
- elif token == '*':
- # Star selector -- matches everything
- pass
- elif token == '>':
- # Run the next token as a CSS selector against the
- # direct children of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.children
- elif token == '~':
- # Run the next token as a CSS selector against the
- # siblings of each tag in the current context.
- recursive_candidate_generator = lambda tag: tag.next_siblings
- elif token == '+':
- # For each tag in the current context, run the next
- # token as a CSS selector against the tag's next
- # sibling that's a tag.
- def next_tag_sibling(tag):
- yield tag.find_next_sibling(True)
- recursive_candidate_generator = next_tag_sibling
-
- elif self.tag_name_re.match(token):
- # Just a tag name.
- tag_name = token
- else:
- raise ValueError(
- 'Unsupported or invalid CSS selector: "%s"' % token)
- if recursive_candidate_generator:
- # This happens when the selector looks like "> foo".
- #
- # The generator calls select() recursively on every
- # member of the current context, passing in a different
- # candidate generator and a different selector.
- #
- # In the case of "> foo", the candidate generator is
- # one that yields a tag's direct children (">"), and
- # the selector is "foo".
- next_token = tokens[index+1]
- def recursive_select(tag):
- if self._select_debug:
- print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
- print('-' * 40)
- for i in tag.select(next_token, recursive_candidate_generator):
- if self._select_debug:
- print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
- yield i
- if self._select_debug:
- print('-' * 40)
- _use_candidate_generator = recursive_select
- elif _candidate_generator is None:
- # By default, a tag's candidates are all of its
- # children. If tag_name is defined, only yield tags
- # with that name.
- if self._select_debug:
- if tag_name:
- check = "[any]"
- else:
- check = tag_name
- print(' Default candidate generator, tag name="%s"' % check)
- if self._select_debug:
- # This is redundant with later code, but it stops
- # a bunch of bogus tags from cluttering up the
- # debug log.
- def default_candidate_generator(tag):
- for child in tag.descendants:
- if not isinstance(child, Tag):
- continue
- if tag_name and not child.name == tag_name:
- continue
- yield child
- _use_candidate_generator = default_candidate_generator
- else:
- _use_candidate_generator = lambda tag: tag.descendants
- else:
- _use_candidate_generator = _candidate_generator
-
- count = 0
- for tag in current_context:
- if self._select_debug:
- print(" Running candidate generator on %s %s" % (
- tag.name, repr(tag.attrs)))
- for candidate in _use_candidate_generator(tag):
- if not isinstance(candidate, Tag):
- continue
- if tag_name and candidate.name != tag_name:
- continue
- if checker is not None:
- try:
- result = checker(candidate)
- except StopIteration:
- # The checker has decided we should no longer
- # run the generator.
- break
- if checker is None or result:
- if self._select_debug:
- print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
- if id(candidate) not in new_context_ids:
- # If a tag matches a selector more than once,
- # don't include it in the context more than once.
- new_context.append(candidate)
- new_context_ids.add(id(candidate))
- if limit and len(new_context) >= limit:
- break
- elif self._select_debug:
- print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
-
-
- current_context = new_context
-
- if self._select_debug:
- print("Final verdict:")
- for i in current_context:
- print(" %s %s" % (i.name, i.attrs))
- return current_context
+ :param namespaces: A dictionary mapping namespace prefixes
+ used in the CSS selector to namespace URIs. By default,
+ Beautiful Soup will use the prefixes it encountered while
+ parsing the document.
+
+ :param limit: After finding this number of results, stop looking.
+
+ :param kwargs: Keyword arguments to be passed into SoupSieve's
+ soupsieve.select() method.
+
+ :return: A ResultSet of Tags.
+ :rtype: bs4.element.ResultSet
+ """
+ return self.css.select(selector, namespaces, limit, **kwargs)
+
+ @property
+ def css(self):
+ """Return an interface to the CSS selector API."""
+ return CSS(self)
# Old names for backwards compatibility
def childGenerator(self):
+ """Deprecated generator."""
return self.children
def recursiveChildGenerator(self):
+ """Deprecated generator."""
return self.descendants
def has_key(self, key):
- """This was kind of misleading because has_key() (attributes)
- was different from __in__ (contents). has_key() is gone in
- Python 3, anyway."""
- warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
- key))
+ """Deprecated method. This was kind of misleading because has_key()
+ (attributes) was different from __contains__ (contents).
+
+ has_key() is gone in Python 3, anyway.
+ """
+ warnings.warn(
+ 'has_key is deprecated. Use has_attr(key) instead.',
+ DeprecationWarning, stacklevel=2
+ )
return self.has_attr(key)
# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
"""Encapsulates a number of ways of matching a markup element (tag or
- text)."""
+ string).
+
+ This is primarily used to underpin the find_* methods, but you can
+ create one yourself and pass it in as `parse_only` to the
+ `BeautifulSoup` constructor, to parse a subset of a large
+ document.
+ """
+
+ def __init__(self, name=None, attrs={}, string=None, **kwargs):
+ """Constructor.
+
+ The SoupStrainer constructor takes the same arguments passed
+ into the find_* methods. See the online documentation for
+ detailed explanations.
+
+ :param name: A filter on tag name.
+ :param attrs: A dictionary of filters on attribute values.
+ :param string: A filter for a NavigableString with specific text.
+ :kwargs: A dictionary of filters on attribute values.
+ """
+ if string is None and 'text' in kwargs:
+ string = kwargs.pop('text')
+ warnings.warn(
+ "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
+ DeprecationWarning, stacklevel=2
+ )
- def __init__(self, name=None, attrs={}, text=None, **kwargs):
self.name = self._normalize_search_value(name)
if not isinstance(attrs, dict):
# Treat a non-dict value for attrs as a search for the 'class'
@@ -1556,12 +2195,15 @@ class SoupStrainer(object):
normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs
- self.text = self._normalize_search_value(text)
+ self.string = self._normalize_search_value(string)
+
+ # DEPRECATED but just in case someone is checking this.
+ self.text = self.string
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
- if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match')
+ if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
@@ -1589,19 +2231,40 @@ class SoupStrainer(object):
return str(str(value))
def __str__(self):
- if self.text:
- return self.text
+ """A human-readable representation of this SoupStrainer."""
+ if self.string:
+ return self.string
else:
return "%s|%s" % (self.name, self.attrs)
def search_tag(self, markup_name=None, markup_attrs={}):
+ """Check whether a Tag with the given name and attributes would
+ match this SoupStrainer.
+
+ Used prospectively to decide whether to even bother creating a Tag
+ object.
+
+ :param markup_name: A tag name as found in some markup.
+ :param markup_attrs: A dictionary of attributes as found in some markup.
+
+ :return: True if the prospective tag would match this SoupStrainer;
+ False otherwise.
+ """
found = None
markup = None
if isinstance(markup_name, Tag):
markup = markup_name
markup_attrs = markup
+
+ if isinstance(self.name, str):
+ # Optimization for a very common case where the user is
+ # searching for a tag with one specific name, and we're
+ # looking at a tag with a different name.
+ if markup and not markup.prefix and self.name != markup.name:
+ return False
+
call_function_with_tag_data = (
- isinstance(self.name, collections.abc.Callable)
+ isinstance(self.name, Callable)
and not isinstance(markup_name, Tag))
if ((not self.name)
@@ -1630,13 +2293,22 @@ class SoupStrainer(object):
found = markup
else:
found = markup_name
- if found and self.text and not self._matches(found.string, self.text):
+ if found and self.string and not self._matches(found.string, self.string):
found = None
return found
+
+ # For BS3 compatibility.
searchTag = search_tag
def search(self, markup):
- # print 'looking for %s in %s' % (self, markup)
+ """Find all items in `markup` that match this SoupStrainer.
+
+ Used by the core _find_all() method, which is ultimately
+ called by all find_* methods.
+
+ :param markup: A PageElement or a list of them.
+ """
+ # print('looking for %s in %s' % (self, markup))
found = None
# If given a list of items, scan it for a text element that
# matches.
@@ -1649,49 +2321,44 @@ class SoupStrainer(object):
# If it's a Tag, make sure its name or attributes match.
# Don't bother with Tags if we're searching for text.
elif isinstance(markup, Tag):
- if not self.text or self.name or self.attrs:
+ if not self.string or self.name or self.attrs:
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
isinstance(markup, str):
- if not self.name and not self.attrs and self._matches(markup, self.text):
+ if not self.name and not self.attrs and self._matches(markup, self.string):
found = markup
else:
raise Exception(
"I don't know how to match against a %s" % markup.__class__)
return found
- def _matches(self, markup, match_against):
- # print u"Matching %s against %s" % (markup, match_against)
+ def _matches(self, markup, match_against, already_tried=None):
+ # print(u"Matching %s against %s" % (markup, match_against))
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
# like 'class'.
- if (isinstance(match_against, str)
- and ' ' in match_against):
- # A bit of a special case. If they try to match "foo
- # bar" on a multivalue attribute's value, only accept
- # the literal value "foo bar"
- #
- # XXX This is going to be pretty slow because we keep
- # splitting match_against. But it shouldn't come up
- # too often.
- return (whitespace_re.split(match_against) == markup)
- else:
- for item in markup:
- if self._matches(item, match_against):
- return True
- return False
+ for item in markup:
+ if self._matches(item, match_against):
+ return True
+ # We didn't match any particular value of the multivalue
+ # attribute, but maybe we match the attribute value when
+ # considered as a string.
+ if self._matches(' '.join(markup), match_against):
+ return True
+ return False
if match_against is True:
# True matches any non-None value.
return markup is not None
- if isinstance(match_against, collections.abc.Callable):
+ if isinstance(match_against, Callable):
return match_against(markup)
# Custom callables take the tag as an argument, but all
# other ways of matching match the tag name as a string.
+ original_markup = markup
if isinstance(markup, Tag):
markup = markup.name
@@ -1702,23 +2369,67 @@ class SoupStrainer(object):
# None matches None, False, an empty string, an empty list, and so on.
return not match_against
- if isinstance(match_against, str):
+ if (hasattr(match_against, '__iter__')
+ and not isinstance(match_against, str)):
+ # We're asked to match against an iterable of items.
+ # The markup must match at least one item in the
+ # iterable. We'll try each one in turn.
+ #
+ # To avoid infinite recursion we need to keep track of
+ # items we've already seen.
+ if not already_tried:
+ already_tried = set()
+ for item in match_against:
+ if item.__hash__:
+ key = item
+ else:
+ key = id(item)
+ if key in already_tried:
+ continue
+ else:
+ already_tried.add(key)
+ if self._matches(original_markup, item, already_tried):
+ return True
+ else:
+ return False
+
+ # Beyond this point we might need to run the test twice: once against
+ # the tag's name and once against its prefixed name.
+ match = False
+
+ if not match and isinstance(match_against, str):
# Exact string match
- return markup == match_against
+ match = markup == match_against
- if hasattr(match_against, 'match'):
+ if not match and hasattr(match_against, 'search'):
# Regexp match
return match_against.search(markup)
- if hasattr(match_against, '__iter__'):
- # The markup must be an exact match against something
- # in the iterable.
- return markup in match_against
+ if (not match
+ and isinstance(original_markup, Tag)
+ and original_markup.prefix):
+ # Try the whole thing again with the prefixed tag name.
+ return self._matches(
+ original_markup.prefix + ':' + original_markup.name, match_against
+ )
+
+ return match
class ResultSet(list):
"""A ResultSet is just a list that keeps track of the SoupStrainer
that created it."""
def __init__(self, source, result=()):
+ """Constructor.
+
+ :param source: A SoupStrainer.
+ :param result: A list of PageElements.
+ """
super(ResultSet, self).__init__(result)
self.source = source
+
+ def __getattr__(self, key):
+ """Raise a helpful exception to explain a common code fix."""
+ raise AttributeError(
+ "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
+ )