diff options
Diffstat (limited to 'bitbake/lib/bs4/element.py')
-rw-r--r-- | bitbake/lib/bs4/element.py | 2219 |
1 files changed, 1465 insertions, 754 deletions
diff --git a/bitbake/lib/bs4/element.py b/bitbake/lib/bs4/element.py index 68be42d138..0aefe734b2 100644 --- a/bitbake/lib/bs4/element.py +++ b/bitbake/lib/bs4/element.py @@ -1,14 +1,27 @@ +# Use of this source code is governed by the MIT license. __license__ = "MIT" -import collections.abc +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable import re import sys import warnings -from bs4.dammit import EntitySubstitution + +from bs4.css import CSS +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) DEFAULT_OUTPUT_ENCODING = "utf-8" -PY3K = (sys.version_info[0] > 2) +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. whitespace_re = re.compile(r"\s+") def _alias(attr): @@ -23,12 +36,49 @@ def _alias(attr): return alias +# These encodings are recognized by Python (so PageElement.encode +# could theoretically support them) but XML and HTML don't recognize +# them (so they should not show up in an XML or HTML document as that +# document's encoding). +# +# If an XML document is encoded in one of these encodings, no encoding +# will be mentioned in the XML declaration. If an HTML document is +# encoded in one of these encodings, and the HTML document has a +# <meta> tag that mentions an encoding, the encoding will be given as +# the empty string. +# +# Source: +# https://docs.python.org/3/library/codecs.html#python-specific-encodings +PYTHON_SPECIFIC_ENCODINGS = set([ + "idna", + "mbcs", + "oem", + "palmos", + "punycode", + "raw_unicode_escape", + "undefined", + "unicode_escape", + "raw-unicode-escape", + "unicode-escape", + "string-escape", + "string_escape", +]) + + class NamespacedAttribute(str): + """A namespaced string (e.g. 'xml:lang') that remembers the namespace + ('xml') and the name ('lang') that were used to create it. + """ - def __new__(cls, prefix, name, namespace=None): - if name is None: + def __new__(cls, prefix, name=None, namespace=None): + if not name: + # This is the default namespace. Its name "has no value" + # per https://www.w3.org/TR/xml-names/#defaulting + name = None + + if not name: obj = str.__new__(cls, prefix) - elif prefix is None: + elif not prefix: # Not really namespaced. obj = str.__new__(cls, name) else: @@ -54,6 +104,11 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + """When an HTML document is being encoded to a given encoding, the + value of a meta tag's 'charset' is the name of the encoding. + """ + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' return encoding @@ -79,118 +134,44 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return obj def encode(self, encoding): + if encoding in PYTHON_SPECIFIC_ENCODINGS: + return '' def rewrite(match): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) -class HTMLAwareEntitySubstitution(EntitySubstitution): - - """Entity substitution rules that are aware of some HTML quirks. - Specifically, the contents of <script> and <style> tags should not - undergo entity substitution. +class PageElement(object): + """Contains the navigational information for some part of the page: + that is, its current location in the parse tree. - Incoming NavigableString objects are checked to see if they're the - direct children of a <script> or <style> tag. + NavigableString, Tag, etc. are all subclasses of PageElement. """ - cdata_containing_tags = set(["script", "style"]) + # In general, we can't tell just by looking at an element whether + # it's contained in an XML document or an HTML document. But for + # Tags (q.v.) we can store this information at parse time. + known_xml = None - preformatted_tags = set(["pre"]) - - @classmethod - def _substitute_if_appropriate(cls, ns, f): - if (isinstance(ns, NavigableString) - and ns.parent is not None - and ns.parent.name in cls.cdata_containing_tags): - # Do nothing. - return ns - # Substitute. - return f(ns) + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements. - @classmethod - def substitute_html(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_html) + :param parent: The parent of this element. - @classmethod - def substitute_xml(cls, ns): - return cls._substitute_if_appropriate( - ns, EntitySubstitution.substitute_xml) + :param previous_element: The element parsed immediately before + this one. -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - # There are five possible values for the "formatter" argument passed in - # to methods like encode() and prettify(): - # - # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to - # XML entities: & < > - # None - The null formatter. Unicode characters are never - # converted to entities. This is not recommended, but it's - # faster than "minimal". - # A function - This function will be called on every string that - # needs to undergo entity substitution. - # - - # In an HTML document, the default "html" and "minimal" functions - # will leave the contents of <script> and <style> tags alone. For - # an XML document, all tags will be given the same treatment. - - HTML_FORMATTERS = { - "html" : HTMLAwareEntitySubstitution.substitute_html, - "minimal" : HTMLAwareEntitySubstitution.substitute_xml, - None : None - } - - XML_FORMATTERS = { - "html" : EntitySubstitution.substitute_html, - "minimal" : EntitySubstitution.substitute_xml, - None : None - } - - def format_string(self, s, formatter='minimal'): - """Format the given string using the given formatter.""" - if not isinstance(formatter, collections.abc.Callable): - formatter = self._formatter_for_name(formatter) - if formatter is None: - output = s - else: - output = formatter(s) - return output + :param next_element: The element parsed immediately before + this one. - @property - def _is_xml(self): - """Is this element part of an XML tree or an HTML tree? + :param previous_sibling: The most recently encountered element + on the same level of the parse tree as this one. - This is used when mapping a formatter name ("minimal") to an - appropriate function (one that performs entity-substitution on - the contents of <script> and <style> tags, or not). It's - inefficient, but it should be called very rarely. + :param previous_sibling: The next element to be encountered + on the same level of the parse tree as this one. """ - if self.parent is None: - # This is the top-level object. It should have .is_xml set - # from tree creation. If not, take a guess--BS is usually - # used on HTML markup. - return getattr(self, 'is_xml', False) - return self.parent._is_xml - - def _formatter_for_name(self, name): - "Look up a formatter function based on its name and the tree." - if self._is_xml: - return self.XML_FORMATTERS.get( - name, EntitySubstitution.substitute_xml) - else: - return self.HTML_FORMATTERS.get( - name, HTMLAwareEntitySubstitution.substitute_xml) - - def setup(self, parent=None, previous_element=None, next_element=None, - previous_sibling=None, next_sibling=None): - """Sets up the initial relations between this element and - other elements.""" self.parent = parent self.previous_element = previous_element @@ -198,48 +179,156 @@ class PageElement(object): self.previous_element.next_element = self self.next_element = next_element - if self.next_element: + if self.next_element is not None: self.next_element.previous_element = self self.next_sibling = next_sibling - if self.next_sibling: + if self.next_sibling is not None: self.next_sibling.previous_sibling = self - if (not previous_sibling + if (previous_sibling is None and self.parent is not None and self.parent.contents): previous_sibling = self.parent.contents[-1] self.previous_sibling = previous_sibling - if previous_sibling: + if previous_sibling is not None: self.previous_sibling.next_sibling = self + def format_string(self, s, formatter): + """Format the given string using the given formatter. + + :param s: A string. + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look + up an XMLFormatter or HTMLFormatter in the appropriate + registry. + """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if isinstance(formatter, Callable): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 - def replace_with(self, replace_with): - if not self.parent: + default = object() + def _all_strings(self, strip=False, types=default): + """Yield all strings of certain classes, possibly stripping them. + + This is implemented differently in Tag and NavigableString. + """ + raise NotImplementedError() + + @property + def stripped_strings(self): + """Yield all strings in this PageElement, stripping them first. + + :yield: A sequence of stripped strings. + """ + for string in self._all_strings(True): + yield string + + def get_text(self, separator="", strip=False, + types=default): + """Get all child strings of this PageElement, concatenated using the + given separator. + + :param separator: Strings will be concatenated using this separator. + + :param strip: If True, strings will be stripped before being + concatenated. + + :param types: A tuple of NavigableString subclasses. Any + strings of a subclass not found in this list will be + ignored. Although there are exceptions, the default + behavior in most cases is to consider only NavigableString + and CData objects. That means no comments, processing + instructions, etc. + + :return: A string. + """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def replace_with(self, *args): + """Replace this PageElement with one or more PageElements, keeping the + rest of the tree the same. + + :param args: One or more PageElements. + :return: `self`, no longer part of the tree. + """ + if self.parent is None: raise ValueError( - "Cannot replace one element with another when the" + "Cannot replace one element with another when the " "element to be replaced is not part of a tree.") - if replace_with is self: + if len(args) == 1 and args[0] is self: return - if replace_with is self.parent: + if any(x is self.parent for x in args): raise ValueError("Cannot replace a Tag with its parent.") old_parent = self.parent my_index = self.parent.index(self) - self.extract() - old_parent.insert(my_index, replace_with) + self.extract(_self_index=my_index) + for idx, replace_with in enumerate(args, start=my_index): + old_parent.insert(idx, replace_with) return self replaceWith = replace_with # BS3 def unwrap(self): + """Replace this PageElement with its contents. + + :return: `self`, no longer part of the tree. + """ my_parent = self.parent - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") my_index = self.parent.index(self) - self.extract() + self.extract(_self_index=my_index) for child in reversed(self.contents[:]): my_parent.insert(my_index, child) return self @@ -247,14 +336,29 @@ class PageElement(object): replaceWithChildren = unwrap # BS3 def wrap(self, wrap_inside): + """Wrap this PageElement inside another one. + + :param wrap_inside: A PageElement. + :return: `wrap_inside`, occupying the position in the tree that used + to be occupied by `self`, and with `self` inside it. + """ me = self.replace_with(wrap_inside) wrap_inside.append(me) return wrap_inside - def extract(self): - """Destructively rips this element out of the tree.""" + def extract(self, _self_index=None): + """Destructively rips this element out of the tree. + + :param _self_index: The location of this element in its parent's + .contents, if known. Passing this in allows for a performance + optimization. + + :return: `self`, no longer part of the tree. + """ if self.parent is not None: - del self.parent.contents[self.parent.index(self)] + if _self_index is None: + _self_index = self.parent.index(self) + del self.parent.contents[_self_index] #Find the two elements that would be next to each other if #this element (and any children) hadn't been parsed. Connect @@ -281,8 +385,13 @@ class PageElement(object): return self def _last_descendant(self, is_initialized=True, accept_self=True): - "Finds the last element beneath this object to be parsed." - if is_initialized and self.next_sibling: + """Finds the last element beneath this object to be parsed. + + :param is_initialized: Has `setup` been called on this PageElement + yet? + :param accept_self: Is `self` an acceptable answer to the question? + """ + if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: last_child = self @@ -295,6 +404,14 @@ class PageElement(object): _lastRecursiveChild = _last_descendant def insert(self, position, new_child): + """Insert a new PageElement in the list of this PageElement's children. + + This works the same way as `list.insert`. + + :param position: The numeric position that should be occupied + in `self.children` by the new PageElement. + :param new_child: A PageElement. + """ if new_child is None: raise ValueError("Cannot insert None into a tag.") if new_child is self: @@ -303,6 +420,14 @@ class PageElement(object): and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. + for subchild in list(new_child.contents): + self.insert(position, subchild) + position += 1 + return position = min(position, len(self.contents)) if hasattr(new_child, 'parent') and new_child.parent is not None: # We're 'inserting' an element that's already one @@ -361,160 +486,326 @@ class PageElement(object): self.contents.insert(position, new_child) def append(self, tag): - """Appends the given tag to the contents of this tag.""" + """Appends the given PageElement to the contents of this one. + + :param tag: A PageElement. + """ self.insert(len(self.contents), tag) - def insert_before(self, predecessor): - """Makes the given element the immediate predecessor of this one. + def extend(self, tags): + """Appends the given PageElements to this one's contents. - The two elements will have the same parent, and the given element + :param tags: A list of PageElements. If a single Tag is + provided instead, this PageElement's contents will be extended + with that Tag's contents. + """ + if isinstance(tags, Tag): + tags = tags.contents + if isinstance(tags, list): + # Moving items around the tree may change their position in + # the original list. Make a list that won't change. + tags = list(tags) + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + All the elements will have the same parent, and the given elements will be immediately before this one. + + :param args: One or more PageElements. """ - if self is predecessor: - raise ValueError("Can't insert an element before itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'before' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, successor): - """Makes the given element the immediate successor of this one. - - The two elements will have the same parent, and the given element + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements will be immediately after this one. + + :param args: One or more PageElements. """ - if self is successor: - raise ValueError("Can't insert an element after itself.") + # Do all error checking before modifying the tree. parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'after' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1, successor) - - def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 + + def find_next(self, name=None, attrs={}, string=None, **kwargs): + """Find the first PageElement that matches the given criteria and + appears later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_all_next, name, attrs, string, **kwargs) findNext = find_next # BS3 - def find_all_next(self, name=None, attrs={}, text=None, limit=None, + def find_all_next(self, name=None, attrs={}, string=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.next_elements, - **kwargs) + """Find all PageElements that match the given criteria and appear + later in the document than this PageElement. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet containing PageElements. + """ + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all(name, attrs, string, limit, self.next_elements, + _stacklevel=_stacklevel+1, **kwargs) findAllNext = find_all_next # BS3 - def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._find_one(self.find_next_siblings, name, attrs, text, + def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): + """Find the closest sibling to this PageElement that matches the + given criteria and appears later in the document. + + All find_* methods take a common set of arguments. See the + online documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_next_siblings, name, attrs, string, **kwargs) findNextSibling = find_next_sibling # BS3 - def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + def find_next_siblings(self, name=None, attrs={}, string=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.next_siblings, **kwargs) + """Find all siblings of this PageElement that match the given criteria + and appear later in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.next_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findNextSiblings = find_next_siblings # BS3 fetchNextSiblings = find_next_siblings # BS2 - def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" + def find_previous(self, name=None, attrs={}, string=None, **kwargs): + """Look backwards in the document from this PageElement and find the + first PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self._find_one( - self.find_all_previous, name, attrs, text, **kwargs) + self.find_all_previous, name, attrs, string, **kwargs) findPrevious = find_previous # BS3 - def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + def find_all_previous(self, name=None, attrs={}, string=None, limit=None, **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.previous_elements, - **kwargs) + """Look backwards in the document from this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, self.previous_elements, + _stacklevel=_stacklevel+1, **kwargs + ) findAllPrevious = find_all_previous # BS3 fetchPrevious = find_all_previous # BS2 - def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._find_one(self.find_previous_siblings, name, attrs, text, + def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs): + """Returns the closest sibling to this PageElement that matches the + given criteria and appears earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + return self._find_one(self.find_previous_siblings, name, attrs, string, **kwargs) findPreviousSibling = find_previous_sibling # BS3 - def find_previous_siblings(self, name=None, attrs={}, text=None, + def find_previous_siblings(self, name=None, attrs={}, string=None, limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.previous_siblings, **kwargs) + """Returns all siblings to this PageElement that match the + given criteria and appear earlier in the document. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all( + name, attrs, string, limit, + self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs + ) findPreviousSiblings = find_previous_siblings # BS3 fetchPreviousSiblings = find_previous_siblings # BS2 def find_parent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" + """Find the closest parent of this PageElement that matches the given + criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ # NOTE: We can't use _find_one because findParents takes a different # set of arguments. r = None - l = self.find_parents(name, attrs, 1, **kwargs) + l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) if l: r = l[0] return r findParent = find_parent # BS3 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" + """Find all parents of this PageElement that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ + _stacklevel = kwargs.pop('_stacklevel', 2) return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) + _stacklevel=_stacklevel+1, **kwargs) findParents = find_parents # BS3 fetchParents = find_parents # BS2 @property def next(self): + """The PageElement, if any, that was parsed just after this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self.next_element @property def previous(self): + """The PageElement, if any, that was parsed just before this one. + + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ return self.previous_element #These methods do the real heavy lifting. - def _find_one(self, method, name, attrs, text, **kwargs): + def _find_one(self, method, name, attrs, string, **kwargs): r = None - l = method(name, attrs, text, 1, **kwargs) + l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) if l: r = l[0] return r - def _find_all(self, name, attrs, text, limit, generator, **kwargs): + def _find_all(self, name, attrs, string, limit, generator, **kwargs): "Iterates over a generator looking for things that match." + _stacklevel = kwargs.pop('_stacklevel', 3) - if text is None and 'string' in kwargs: - text = kwargs['string'] - del kwargs['string'] + if string is None and 'text' in kwargs: + string = kwargs.pop('text') + warnings.warn( + "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", + DeprecationWarning, stacklevel=_stacklevel + ) if isinstance(name, SoupStrainer): strainer = name else: - strainer = SoupStrainer(name, attrs, text, **kwargs) + strainer = SoupStrainer(name, attrs, string, **kwargs) - if text is None and not limit and not attrs and not kwargs: + if string is None and not limit and not attrs and not kwargs: if name is True or name is None: # Optimization to find all tags. result = (element for element in generator @@ -522,9 +813,23 @@ class PageElement(object): return ResultSet(strainer, result) elif isinstance(name, str): # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) + else: + prefix = None + local_name = name result = (element for element in generator if isinstance(element, Tag) - and element.name == name) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) + ) return ResultSet(strainer, result) results = ResultSet(strainer) while True: @@ -544,6 +849,10 @@ class PageElement(object): #NavigableStrings and Tags. @property def next_elements(self): + """All PageElements that were parsed after this one. + + :yield: A sequence of PageElements. + """ i = self.next_element while i is not None: yield i @@ -551,6 +860,11 @@ class PageElement(object): @property def next_siblings(self): + """All PageElements that are siblings of this one but were parsed + later. + + :yield: A sequence of PageElements. + """ i = self.next_sibling while i is not None: yield i @@ -558,6 +872,10 @@ class PageElement(object): @property def previous_elements(self): + """All PageElements that were parsed before this one. + + :yield: A sequence of PageElements. + """ i = self.previous_element while i is not None: yield i @@ -565,6 +883,11 @@ class PageElement(object): @property def previous_siblings(self): + """All PageElements that are siblings of this one but were parsed + earlier. + + :yield: A sequence of PageElements. + """ i = self.previous_sibling while i is not None: yield i @@ -572,87 +895,23 @@ class PageElement(object): @property def parents(self): + """All PageElements that are parents of this PageElement. + + :yield: A sequence of PageElements. + """ i = self.parent while i is not None: yield i i = i.parent - # Methods for supporting CSS selectors. - - tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - - # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---------------------------/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute - # Tag - attribselect_re = re.compile( - r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + - r'=?"?(?P<value>[^\]"]*)"?\]$' - ) - - def _attr_value_as_string(self, value, default=None): - """Force an attribute value into a string representation. + @property + def decomposed(self): + """Check whether a PageElement has been decomposed. - A multi-valued attribute will be converted into a - space-separated stirng. + :rtype: bool """ - value = self.get(value, default) - if isinstance(value, list) or isinstance(value, tuple): - value =" ".join(value) - return value - - def _tag_name_matches_and(self, function, tag_name): - if not tag_name: - return function - else: - def _match(tag): - return tag.name == tag_name and function(tag) - return _match - - def _attribute_checker(self, operator, attribute, value=''): - """Create a function that performs a CSS selector operation. - - Takes an operator, attribute and optional value. Returns a - function that will return True for elements that match that - combination. - """ - if operator == '=': - # string representation of `attribute` is equal to `value` - return lambda el: el._attr_value_as_string(attribute) == value - elif operator == '~': - # space-separated list representation of `attribute` - # contains `value` - def _includes_value(element): - attribute_value = element.get(attribute, []) - if not isinstance(attribute_value, list): - attribute_value = attribute_value.split() - return value in attribute_value - return _includes_value - elif operator == '^': - # string representation of `attribute` starts with `value` - return lambda el: el._attr_value_as_string( - attribute, '').startswith(value) - elif operator == '$': - # string represenation of `attribute` ends with `value` - return lambda el: el._attr_value_as_string( - attribute, '').endswith(value) - elif operator == '*': - # string representation of `attribute` contains `value` - return lambda el: value in el._attr_value_as_string(attribute, '') - elif operator == '|': - # string representation of `attribute` is either exactly - # `value` or starts with `value` and then a dash. - def _is_or_starts_with_dash(element): - attribute_value = element._attr_value_as_string(attribute, '') - return (attribute_value == value or attribute_value.startswith( - value + '-')) - return _is_or_starts_with_dash - else: - return lambda el: el.has_attr(attribute) - + return getattr(self, '_decomposed', False) or False + # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -672,6 +931,11 @@ class PageElement(object): class NavigableString(str, PageElement): + """A Python Unicode string that is part of a parse tree. + + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a NavigableString for the string "penguin". + """ PREFIX = '' SUFFIX = '' @@ -691,12 +955,22 @@ class NavigableString(str, PageElement): u.setup() return u - def __copy__(self): + def __deepcopy__(self, memo, recursive=False): """A copy of a NavigableString has the same contents and class as the original, but it is not connected to the parse tree. + + :param recursive: This parameter is ignored; it's only defined + so that NavigableString.__deepcopy__ implements the same + signature as Tag.__deepcopy__. """ return type(self)(self) + def __copy__(self): + """A copy of a NavigableString can only be a deep copy, because + only one PageElement can occupy a given place in a parse tree. + """ + return self.__deepcopy__({}) + def __getnewargs__(self): return (str(self),) @@ -712,55 +986,146 @@ class NavigableString(str, PageElement): self.__class__.__name__, attr)) def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter. + + :param formatter: A Formatter object, or a string naming one of the standard formatters. + """ output = self.format_string(self, formatter) return self.PREFIX + output + self.SUFFIX @property def name(self): + """Since a NavigableString is not a Tag, it has no .name. + + This property is implemented so that code like this doesn't crash + when run on a mixture of Tag and NavigableString objects: + [x.name for x in tag.children] + """ return None @name.setter def name(self, name): + """Prevent NavigableString.name from ever being set.""" raise AttributeError("A NavigableString cannot be given a name.") + def _all_strings(self, strip=False, types=PageElement.default): + """Yield all strings of certain classes, possibly stripping them. + + This makes it easy for NavigableString to implement methods + like get_text() as conveniences, creating a consistent + text-extraction API across all PageElements. + + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. If this + NavigableString isn't one of those subclasses, the + sequence will be empty. By default, the subclasses + considered are NavigableString and CData objects. That + means no comments, processing instructions, etc. + + :yield: A sequence that either contains this string, or is empty. + + """ + if types is self.default: + # This is kept in Tag because it's full of subclasses of + # this class, which aren't defined until later in the file. + types = Tag.DEFAULT_INTERESTING_STRING_TYPES + + # Do nothing if the caller is looking for specific types of + # string, and we're of a different type. + # + # We check specific types instead of using isinstance(self, + # types) because all of these classes subclass + # NavigableString. Anyone who's using this feature probably + # wants generic NavigableStrings but not other stuff. + my_type = type(self) + if types is not None: + if isinstance(types, type): + # Looking for a single type. + if my_type is not types: + return + elif my_type not in types: + # Looking for one of a list of types. + return + + value = self + if strip: + value = value.strip() + if len(value) > 0: + yield value + strings = property(_all_strings) + class PreformattedString(NavigableString): """A NavigableString not subject to the normal formatting rules. - The string will be passed into the formatter (to trigger side effects), - but the return value will be ignored. + This is an abstract class used for special kinds of strings such + as comments (the Comment class) and CDATA blocks (the CData + class). """ - def output_ready(self, formatter="minimal"): - """CData strings are passed into the formatter. - But the return value is ignored.""" - self.format_string(self, formatter) + PREFIX = '' + SUFFIX = '' + + def output_ready(self, formatter=None): + """Make this string ready for output by adding any subclass-specific + prefix or suffix. + + :param formatter: A Formatter object, or a string naming one + of the standard formatters. The string will be passed into the + Formatter, but only to trigger any side effects: the return + value is ignored. + + :return: The string, with any subclass-specific prefix and + suffix added on. + """ + if formatter is not None: + ignore = self.format_string(self, formatter) return self.PREFIX + self + self.SUFFIX class CData(PreformattedString): - + """A CDATA block.""" PREFIX = '<![CDATA[' SUFFIX = ']]>' class ProcessingInstruction(PreformattedString): + """A SGML processing instruction.""" PREFIX = '<?' SUFFIX = '>' -class Comment(PreformattedString): +class XMLProcessingInstruction(ProcessingInstruction): + """An XML processing instruction.""" + PREFIX = '<?' + SUFFIX = '?>' +class Comment(PreformattedString): + """An HTML or XML comment.""" PREFIX = '<!--' SUFFIX = '-->' class Declaration(PreformattedString): + """An XML declaration.""" PREFIX = '<?' SUFFIX = '?>' class Doctype(PreformattedString): - + """A document type declaration.""" @classmethod def for_name_and_ids(cls, name, pub_id, system_id): + """Generate an appropriate document type declaration for a given + public ID and system ID. + + :param name: The name of the document's root element, e.g. 'html'. + :param pub_id: The Formal Public Identifier for this document type, + e.g. '-//W3C//DTD XHTML 1.1//EN' + :param system_id: The system identifier for this document type, + e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' + + :return: A Doctype. + """ value = name or '' if pub_id is not None: value += ' PUBLIC "%s"' % pub_id @@ -775,14 +1140,105 @@ class Doctype(PreformattedString): SUFFIX = '>\n' +class Stylesheet(NavigableString): + """A NavigableString representing an stylesheet (probably + CSS). + + Used to distinguish embedded stylesheets from textual content. + """ + pass + + +class Script(NavigableString): + """A NavigableString representing an executable script (probably + Javascript). + + Used to distinguish executable code from textual content. + """ + pass + + +class TemplateString(NavigableString): + """A NavigableString representing a string found inside an HTML + template embedded in a larger document. + + Used to distinguish such strings from the main body of the document. + """ + pass + + +class RubyTextString(NavigableString): + """A NavigableString representing the contents of the <rt> HTML + element. + + https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element + + Can be used to distinguish such strings from the strings they're + annotating. + """ + pass + + +class RubyParenthesisString(NavigableString): + """A NavigableString representing the contents of the <rp> HTML + element. + + https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element + """ + pass + + class Tag(PageElement): + """Represents an HTML or XML tag that is part of a parse tree, along + with its attributes and contents. - """Represents a found HTML tag with its attributes and contents.""" + When Beautiful Soup parses the markup <b>penguin</b>, it will + create a Tag object representing the <b> tag. + """ def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None): - "Basic constructor." - + prefix=None, attrs=None, parent=None, previous=None, + is_xml=None, sourceline=None, sourcepos=None, + can_be_empty_element=None, cdata_list_attributes=None, + preserve_whitespace_tags=None, + interesting_string_types=None, + namespaces=None + ): + """Basic constructor. + + :param parser: A BeautifulSoup object. + :param builder: A TreeBuilder. + :param name: The name of the tag. + :param namespace: The URI of this Tag's XML namespace, if any. + :param prefix: The prefix for this Tag's XML namespace, if any. + :param attrs: A dictionary of this Tag's attribute values. + :param parent: The PageElement to use as this Tag's parent. + :param previous: The PageElement that was parsed immediately before + this tag. + :param is_xml: If True, this is an XML tag. Otherwise, this is an + HTML tag. + :param sourceline: The line number where this tag was found in its + source document. + :param sourcepos: The character position within `sourceline` where this + tag was found. + :param can_be_empty_element: If True, this tag should be + represented as <tag/>. If False, this tag should be represented + as <tag></tag>. + :param cdata_list_attributes: A list of attributes whose values should + be treated as CDATA if they ever show up on this tag. + :param preserve_whitespace_tags: A list of tag names whose contents + should have their whitespace preserved. + :param interesting_string_types: This is a NavigableString + subclass or a tuple of them. When iterating over this + Tag's strings in methods like Tag.strings or Tag.get_text, + these are the types of strings that are interesting enough + to be considered. The default is to consider + NavigableString and CData the only interesting string + subtypes. + :param namespaces: A dictionary mapping currently active + namespace prefixes to URIs. This can be used later to + construct CSS selectors. + """ if parser is None: self.parser_class = None else: @@ -793,7 +1249,12 @@ class Tag(PageElement): raise ValueError("No value provided for new tag's name.") self.name = name self.namespace = namespace + self._namespaces = namespaces or {} self.prefix = prefix + if ((not builder or builder.store_line_numbers) + and (sourceline is not None or sourcepos is not None)): + self.sourceline = sourceline + self.sourcepos = sourcepos if attrs is None: attrs = {} elif attrs: @@ -804,32 +1265,109 @@ class Tag(PageElement): attrs = dict(attrs) else: attrs = dict(attrs) + + # If possible, determine ahead of time whether this tag is an + # XML tag. + if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml self.attrs = attrs self.contents = [] self.setup(parent, previous) self.hidden = False - # Set up any substitutions, such as the charset in a META tag. - if builder is not None: + if builder is None: + # In the absence of a TreeBuilder, use whatever values were + # passed in here. They're probably None, unless this is a copy of some + # other tag. + self.can_be_empty_element = can_be_empty_element + self.cdata_list_attributes = cdata_list_attributes + self.preserve_whitespace_tags = preserve_whitespace_tags + self.interesting_string_types = interesting_string_types + else: + # Set up any substitutions for this tag, such as the charset in a META tag. builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. self.can_be_empty_element = builder.can_be_empty_element(name) - else: - self.can_be_empty_element = False + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + if self.name in builder.string_containers: + # This sort of tag uses a special string container + # subclass for most of its strings. When we ask the + self.interesting_string_types = builder.string_containers[self.name] + else: + self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES parserClass = _alias("parser_class") # BS3 - def __copy__(self): - """A copy of a Tag is a new Tag, unconnected to the parse tree. + def __deepcopy__(self, memo, recursive=True): + """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. Its contents are a copy of the old Tag's contents. """ - clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs) + clone = self._clone() + + if recursive: + # Clone this tag's descendants recursively, but without + # making any recursive function calls. + tag_stack = [clone] + for event, element in self._event_stream(self.descendants): + if event is Tag.END_ELEMENT_EVENT: + # Stop appending incoming Tags to the Tag that was + # just closed. + tag_stack.pop() + else: + descendant_clone = element.__deepcopy__( + memo, recursive=False + ) + # Add to its parent's .contents + tag_stack[-1].append(descendant_clone) + + if event is Tag.START_ELEMENT_EVENT: + # Add the Tag itself to the stack so that its + # children will be .appended to it. + tag_stack.append(descendant_clone) + return clone + + def __copy__(self): + """A copy of a Tag must always be a deep copy, because a Tag's + children can only have one parent at a time. + """ + return self.__deepcopy__({}) + + def _clone(self): + """Create a new Tag just like this one, but with no + contents and unattached to any parse tree. + + This is the first step in the deepcopy process. + """ + clone = type(self)( + None, None, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml, + sourceline=self.sourceline, sourcepos=self.sourcepos, + can_be_empty_element=self.can_be_empty_element, + cdata_list_attributes=self.cdata_list_attributes, + preserve_whitespace_tags=self.preserve_whitespace_tags, + interesting_string_types=self.interesting_string_types + ) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) return clone - + @property def is_empty_element(self): """Is this tag an empty-element tag? (aka a self-closing tag) @@ -850,13 +1388,17 @@ class Tag(PageElement): @property def string(self): - """Convenience property to get the single string within this tag. + """Convenience property to get the single string within this + PageElement. - :Return: If this tag has a single string child, return value - is that string. If this tag has no children, or more than one - child, return value is None. If this tag has one child tag, + TODO It might make sense to have NavigableString.string return + itself. + + :return: If this element has a single string child, return + value is that string. If this element has one child tag, return value is the 'string' attribute of the child tag, - recursively. + recursively. If this element is itself a string, has no + children, or has more than one child, return value is None. """ if len(self.contents) != 1: return None @@ -867,57 +1409,75 @@ class Tag(PageElement): @string.setter def string(self, string): + """Replace this PageElement's contents with `string`.""" self.clear() self.append(string.__class__(string)) - def _all_strings(self, strip=False, types=(NavigableString, CData)): + DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) + def _all_strings(self, strip=False, types=PageElement.default): """Yield all strings of certain classes, possibly stripping them. - By default, yields only NavigableString and CData objects. So - no comments, processing instructions, etc. + :param strip: If True, all strings will be stripped before being + yielded. + + :param types: A tuple of NavigableString subclasses. Any strings of + a subclass not found in this list will be ignored. By + default, the subclasses considered are the ones found in + self.interesting_string_types. If that's not specified, + only NavigableString and CData objects will be + considered. That means no comments, processing + instructions, etc. + + :yield: A sequence of strings. + """ + if types is self.default: + types = self.interesting_string_types + for descendant in self.descendants: - if ( - (types is None and not isinstance(descendant, NavigableString)) - or - (types is not None and type(descendant) not in types)): + if (types is None and not isinstance(descendant, NavigableString)): + continue + descendant_type = type(descendant) + if isinstance(types, type): + if descendant_type is not types: + # We're not interested in strings of this type. + continue + elif types is not None and descendant_type not in types: + # We're not interested in strings of this type. continue if strip: descendant = descendant.strip() if len(descendant) == 0: continue yield descendant - strings = property(_all_strings) - @property - def stripped_strings(self): - for string in self._all_strings(True): - yield string + def decompose(self): + """Recursively destroys this PageElement and its children. - def get_text(self, separator="", strip=False, - types=(NavigableString, CData)): - """ - Get all child strings, concatenated using the given separator. - """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) + This element will be removed from the tree and wiped out; so + will everything beneath it. - def decompose(self): - """Recursively destroys the contents of this tree.""" + The behavior of a decomposed PageElement is undefined and you + should never use one for anything, but if you need to _check_ + whether an element has been decomposed, you can use the + `decomposed` property. + """ self.extract() i = self while i is not None: - next = i.next_element + n = i.next_element i.__dict__.clear() i.contents = [] - i = next + i._decomposed = True + i = n def clear(self, decompose=False): - """ - Extract all children. If decompose is True, decompose instead. + """Wipe out all children of this PageElement by calling extract() + on them. + + :param decompose: If this is True, decompose() (a more + destructive method) will be called instead of extract(). """ if decompose: for element in self.contents[:]: @@ -929,10 +1489,51 @@ class Tag(PageElement): for element in self.contents[:]: element.extract() - def index(self, element): + def smooth(self): + """Smooth out this element's children by consolidating consecutive + strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. """ - Find the index of a child by identity, not value. Avoids issues with - tag.contents.index(element) getting the index of equal elements. + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + + def index(self, element): + """Find the index of a child by identity, not value. + + Avoids issues with tag.contents.index(element) getting the + index of equal elements. + + :param element: Look for this PageElement in `self.contents`. """ for i, child in enumerate(self.contents): if child is element: @@ -945,23 +1546,38 @@ class Tag(PageElement): attribute.""" return self.attrs.get(key, default) + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list. + + :param key: The attribute to look for. + :param default: Use this value if the attribute is not present + on this PageElement. + :return: A list of values, probably containing only a single + value. + """ + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + def has_attr(self, key): + """Does this PageElement have an attribute with the given name?""" return key in self.attrs def __hash__(self): return str(self).__hash__() def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, + """tag[key] returns the value of the 'key' attribute for the Tag, and throws an exception if it's not there.""" return self.attrs[key] def __iter__(self): - "Iterating over a tag iterates over its contents." + "Iterating over a Tag iterates over its contents." return iter(self.contents) def __len__(self): - "The length of a tag is the length of its list of contents." + "The length of a Tag is the length of its list of contents." return len(self.contents) def __contains__(self, x): @@ -981,29 +1597,33 @@ class Tag(PageElement): self.attrs.pop(key, None) def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its + """Calling a Tag like a function is the same as calling its find_all() method. Eg. tag('a') returns a list of all the A tags found within this tag.""" return self.find_all(*args, **kwargs) def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) + """Calling tag.subtag is the same as calling tag.find(name="subtag")""" + #print("Getattr %s.%s" % (self.__class__, tag)) if len(tag) > 3 and tag.endswith('Tag'): # BS3: soup.aTag -> "soup.find("a") tag_name = tag[:-3] warnings.warn( - '.%sTag is deprecated, use .find("%s") instead.' % ( - tag_name, tag_name)) + '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ), + DeprecationWarning, stacklevel=2 + ) return self.find(tag_name) # We special case contents to avoid recursion. - elif not tag.startswith("__") and not tag=="contents": + elif not tag.startswith("__") and not tag == "contents": return self.find(tag) raise AttributeError( "'%s' object has no attribute '%s'" % (self.__class__, tag)) def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag.""" + """Returns true iff this Tag has the same name, the same attributes, + and the same contents (recursively) as `other`.""" if self is other: return True if (not hasattr(other, 'name') or @@ -1019,69 +1639,235 @@ class Tag(PageElement): return True def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, + """Returns true iff this Tag is not identical to `other`, as defined in __eq__.""" return not self == other def __repr__(self, encoding="unicode-escape"): - """Renders this tag as a string.""" - if PY3K: - # "The return value must be a string object", i.e. Unicode - return self.decode() - else: - # "The return value must be a string object", i.e. a bytestring. - # By convention, the return value of __repr__ should also be - # an ASCII string. - return self.encode(encoding) + """Renders this PageElement as a string. - def __unicode__(self): + :param encoding: The encoding to use (Python 2 only). + TODO: This is now ignored and a warning should be issued + if a value is provided. + :return: A (Unicode) string. + """ + # "The return value must be a string object", i.e. Unicode return self.decode() - def __str__(self): - if PY3K: - return self.decode() - else: - return self.encode() + def __unicode__(self): + """Renders this PageElement as a Unicode string.""" + return self.decode() - if PY3K: - __str__ = __repr__ = __unicode__ + __str__ = __repr__ = __unicode__ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, indent_level=None, formatter="minimal", errors="xmlcharrefreplace"): + """Render a bytestring representation of this PageElement and its + contents. + + :param encoding: The destination encoding. + :param indent_level: Each line of the rendering will be + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :param errors: An error handling strategy such as + 'xmlcharrefreplace'. This value is passed along into + encode() and its value should be one of the constants + defined by Python. + :return: A bytestring. + + """ # Turn the data structure into Unicode, then encode the # Unicode. u = self.decode(indent_level, encoding, formatter) return u.encode(encoding, errors) - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed?""" - return ( - indent_level is not None and - (self.name not in HTMLAwareEntitySubstitution.preformatted_tags - or self._is_xml)) - def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Returns a Unicode representation of this tag and its contents. + formatter="minimal", + iterator=None): + pieces = [] + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + + if indent_level is True: + indent_level = 0 + + # The currently active tag that put us into string literal + # mode. Until this element is closed, children will be treated + # as string literals and not pretty-printed. String literal + # mode is turned on immediately after this tag begins, and + # turned off immediately before it's closed. This means there + # will be whitespace before and after the tag itself. + string_literal_tag = None + + for event, element in self._event_stream(iterator): + if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): + piece = element._format_tag( + eventual_encoding, formatter, opening=True + ) + elif event is Tag.END_ELEMENT_EVENT: + piece = element._format_tag( + eventual_encoding, formatter, opening=False + ) + if indent_level is not None: + indent_level -= 1 + else: + piece = element.output_ready(formatter) + + # Now we need to apply the 'prettiness' -- extra + # whitespace before and/or after this tag. This can get + # complicated because certain tags, like <pre> and + # <script>, can't be prettified, since adding whitespace would + # change the meaning of the content. + + # The default behavior is to add whitespace before and + # after an element when string literal mode is off, and to + # leave things as they are when string literal mode is on. + if string_literal_tag: + indent_before = indent_after = False + else: + indent_before = indent_after = True + + # The only time the behavior is more complex than that is + # when we encounter an opening or closing tag that might + # put us into or out of string literal mode. + if (event is Tag.START_ELEMENT_EVENT + and not string_literal_tag + and not element._should_pretty_print()): + # We are about to enter string literal mode. Add + # whitespace before this tag, but not after. We + # will stay in string literal mode until this tag + # is closed. + indent_before = True + indent_after = False + string_literal_tag = element + elif (event is Tag.END_ELEMENT_EVENT + and element is string_literal_tag): + # We are about to exit string literal mode by closing + # the tag that sent us into that mode. Add whitespace + # after this tag, but not before. + indent_before = False + indent_after = True + string_literal_tag = None + + # Now we know whether to add whitespace before and/or + # after this element. + if indent_level is not None: + if (indent_before or indent_after): + if isinstance(element, NavigableString): + piece = piece.strip() + if piece: + piece = self._indent_string( + piece, indent_level, formatter, + indent_before, indent_after + ) + if event == Tag.START_ELEMENT_EVENT: + indent_level += 1 + pieces.append(piece) + return "".join(pieces) + + # Names for the different events yielded by _event_stream + START_ELEMENT_EVENT = object() + END_ELEMENT_EVENT = object() + EMPTY_ELEMENT_EVENT = object() + STRING_ELEMENT_EVENT = object() + + def _event_stream(self, iterator=None): + """Yield a sequence of events that can be used to reconstruct the DOM + for this element. + + This lets us recreate the nested structure of this element + (e.g. when formatting it as a string) without using recursive + method calls. + + This is similar in concept to the SAX API, but it's a simpler + interface designed for internal use. The events are different + from SAX and the arguments associated with the events are Tags + and other Beautiful Soup objects. + + :param iterator: An alternate iterator to use when traversing + the tree. + """ + tag_stack = [] - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. + iterator = iterator or self.self_and_descendants + + for c in iterator: + # If the parent of the element we're about to yield is not + # the tag currently on the stack, it means that the tag on + # the stack closed before this element appeared. + while tag_stack and c.parent != tag_stack[-1]: + now_closed_tag = tag_stack.pop() + yield Tag.END_ELEMENT_EVENT, now_closed_tag + + if isinstance(c, Tag): + if c.is_empty_element: + yield Tag.EMPTY_ELEMENT_EVENT, c + else: + yield Tag.START_ELEMENT_EVENT, c + tag_stack.append(c) + continue + else: + yield Tag.STRING_ELEMENT_EVENT, c + + while tag_stack: + now_closed_tag = tag_stack.pop() + yield Tag.END_ELEMENT_EVENT, now_closed_tag + + def _indent_string(self, s, indent_level, formatter, + indent_before, indent_after): + """Add indentation whitespace before and/or after a string. + + :param s: The string to amend with whitespace. + :param indent_level: The indentation level; affects how much + whitespace goes before the string. + :param indent_before: Whether or not to add whitespace + before the string. + :param indent_after: Whether or not to add whitespace + (a newline) after the string. """ + space_before = '' + if indent_before and indent_level: + space_before = (formatter.indent * indent_level) - # First off, turn a string formatter into a function. This - # will stop the lookup from happening over and over again. - if not isinstance(formatter, collections.abc.Callable): - formatter = self._formatter_for_name(formatter) + space_after = '' + if indent_after: + space_after = "\n" - attrs = [] - if self.attrs: - for key, val in sorted(self.attrs.items()): + return space_before + s + space_after + + def _format_tag(self, eventual_encoding, formatter, opening): + if self.hidden: + # A hidden tag is invisible, although its contents + # are visible. + return '' + + # A tag starts with the < character (see below). + + # Then the / character, if this is a closing tag. + closing_slash = '' + if not opening: + closing_slash = '/' + + # Then an optional namespace prefix. + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + # Then a list of attribute values, if this is an opening tag. + attribute_string = '' + if opening: + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: if val is None: decoded = key else: @@ -1090,71 +1876,52 @@ class Tag(PageElement): elif not isinstance(val, str): val = str(val) elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None): + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): val = val.encode(eventual_encoding) - text = self.format_string(val, formatter) + text = formatter.attribute_value(val) decoded = ( str(key) + '=' - + EntitySubstitution.quoted_attribute_value(text)) + + formatter.quoted_attribute_value(text)) attrs.append(decoded) - close = '' - closeTag = '' - - prefix = '' - if self.prefix: - prefix = self.prefix + ":" + if attrs: + attribute_string = ' ' + ' '.join(attrs) + # Then an optional closing slash (for a void element in an + # XML document). + void_element_closing_slash = '' if self.is_empty_element: - close = '/' - else: - closeTag = '</%s%s>' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (' ' * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter) + void_element_closing_slash = formatter.void_element_close_prefix or '' - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' - if attrs: - attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s + # Put it all together. + return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' + + def _should_pretty_print(self, indent_level=1): + """Should this tag be pretty-printed? + + Most of them should, but some (such as <pre> in HTML + documents) should not. + """ + return ( + indent_level is not None + and ( + not self.preserve_whitespace_tags + or self.name not in self.preserve_whitespace_tags + ) + ) def prettify(self, encoding=None, formatter="minimal"): + """Pretty-print this PageElement as a string. + + :param encoding: The eventual encoding of the string. If this is None, + a Unicode string will be returned. + :param formatter: A Formatter object, or a string naming one of + the standard formatters. + :return: A Unicode string (if encoding==None) or a bytestring + (otherwise). + """ if encoding is None: return self.decode(True, formatter=formatter) else: @@ -1166,62 +1933,50 @@ class Tag(PageElement): """Renders the contents of this tag as a Unicode string. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ + encoded into this encoding. decode_contents() is _not_ responsible for performing that encoding. This information is passed in so that it can be substituted in if the document contains a <META> tag that mentions the document's encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. - """ - # First off, turn a string formatter into a function. This - # will stop the lookup from happening over and over again. - if not isinstance(formatter, collections.abc.Callable): - formatter = self._formatter_for_name(formatter) - - pretty_print = (indent_level is not None) - s = [] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(formatter) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - formatter)) - if text and indent_level and not self.name == 'pre': - text = text.strip() - if text: - if pretty_print and not self.name == 'pre': - s.append(" " * (indent_level - 1)) - s.append(text) - if pretty_print and not self.name == 'pre': - s.append("\n") - return ''.join(s) + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + + """ + return self.decode(indent_level, eventual_encoding, formatter, + iterator=self.descendants) def encode_contents( self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"): - """Renders the contents of this tag as a bytestring. + """Renders the contents of this PageElement as a bytestring. :param indent_level: Each line of the rendering will be - indented this many spaces. + indented this many levels. (The formatter decides what a + 'level' means in terms of spaces or other characters + output.) Used internally in recursive calls while + pretty-printing. :param eventual_encoding: The bytestring will be in this encoding. - :param formatter: The output formatter responsible for converting - entities to Unicode characters. - """ + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + :return: A bytestring. + """ contents = self.decode_contents(indent_level, encoding, formatter) return contents.encode(encoding) # Old method for BS3 compatibility def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, prettyPrint=False, indentLevel=0): + """Deprecated method for BS3 compatibility.""" if not prettyPrint: indentLevel = None return self.encode_contents( @@ -1229,44 +1984,88 @@ class Tag(PageElement): #Soup methods - def find(self, name=None, attrs={}, recursive=True, text=None, + def find(self, name=None, attrs={}, recursive=True, string=None, **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" + """Look in the children of this PageElement and find the first + PageElement that matches the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A PageElement. + :rtype: bs4.element.Tag | bs4.element.NavigableString + """ r = None - l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, + **kwargs) if l: r = l[0] return r - findChild = find + findChild = find #BS2 - def find_all(self, name=None, attrs={}, recursive=True, text=None, + def find_all(self, name=None, attrs={}, recursive=True, string=None, limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - + """Look in the children of this PageElement and find all + PageElements that match the given criteria. + + All find_* methods take a common set of arguments. See the online + documentation for detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param recursive: If this is True, find_all() will perform a + recursive search of this PageElement's children. Otherwise, + only the direct children will be considered. + :param limit: Stop looking after finding this many results. + :kwargs: A dictionary of filters on attribute values. + :return: A ResultSet of PageElements. + :rtype: bs4.element.ResultSet + """ generator = self.descendants if not recursive: generator = self.children - return self._find_all(name, attrs, text, limit, generator, **kwargs) + _stacklevel = kwargs.pop('_stacklevel', 2) + return self._find_all(name, attrs, string, limit, generator, + _stacklevel=_stacklevel+1, **kwargs) findAll = find_all # BS3 findChildren = find_all # BS2 #Generator methods @property def children(self): + """Iterate over all direct children of this PageElement. + + :yield: A sequence of PageElements. + """ # return iter() to make the purpose of the method clear return iter(self.contents) # XXX This seems to be untested. @property + def self_and_descendants(self): + """Iterate over this PageElement and its children in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ + if not self.hidden: + yield self + for i in self.descendants: + yield i + + @property def descendants(self): + """Iterate over all children of this PageElement in a + breadth-first sequence. + + :yield: A sequence of PageElements. + """ if not len(self.contents): return stopNode = self._last_descendant().next_element @@ -1276,262 +2075,102 @@ class Tag(PageElement): current = current.next_element # CSS selector code + def select_one(self, selector, namespaces=None, **kwargs): + """Perform a CSS selection operation on the current element. - _selector_combinators = ['>', '+', '~'] - _select_debug = False - def select_one(self, selector): - """Perform a CSS selection operation on the current element.""" - value = self.select(selector, limit=1) - if value: - return value[0] - return None + :param selector: A CSS selector. - def select(self, selector, _candidate_generator=None, limit=None): - """Perform a CSS selection operation on the current element.""" - - # Handle grouping selectors if ',' exists, ie: p,a - if ',' in selector: - context = [] - for partial_selector in selector.split(','): - partial_selector = partial_selector.strip() - if partial_selector == '': - raise ValueError('Invalid group selection syntax: %s' % selector) - candidates = self.select(partial_selector, limit=limit) - for candidate in candidates: - if candidate not in context: - context.append(candidate) - - if limit and len(context) >= limit: - break - return context + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. - tokens = selector.split() - current_context = [self] + :param kwargs: Keyword arguments to be passed into Soup Sieve's + soupsieve.select() method. - if tokens[-1] in self._selector_combinators: - raise ValueError( - 'Final combinator "%s" is missing an argument.' % tokens[-1]) + :return: A Tag. + :rtype: bs4.element.Tag + """ + return self.css.select_one(selector, namespaces, **kwargs) - if self._select_debug: - print('Running CSS selector "%s"' % selector) + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. - for index, token in enumerate(tokens): - new_context = [] - new_context_ids = set([]) + This uses the SoupSieve library. - if tokens[index-1] in self._selector_combinators: - # This token was consumed by the previous combinator. Skip it. - if self._select_debug: - print(' Token was consumed by the previous combinator.') - continue + :param selector: A string containing a CSS selector. - if self._select_debug: - print(' Considering token "%s"' % token) - recursive_candidate_generator = None - tag_name = None - - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token: - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is None: - pseudo_type = pseudo - pseudo_value = None - else: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - if self.count > self.destination: - # Stop the generator that's sending us - # these things. - raise StopIteration() - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) - print('-' * 40) - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) - yield i - if self._select_debug: - print('-' * 40) - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print(' Default candidate generator, tag name="%s"' % check) - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - count = 0 - for tag in current_context: - if self._select_debug: - print(" Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs))) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. - break - if checker is None or result: - if self._select_debug: - print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - if limit and len(new_context) >= limit: - break - elif self._select_debug: - print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) - - - current_context = new_context - - if self._select_debug: - print("Final verdict:") - for i in current_context: - print(" %s %s" % (i.name, i.attrs)) - return current_context + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param kwargs: Keyword arguments to be passed into SoupSieve's + soupsieve.select() method. + + :return: A ResultSet of Tags. + :rtype: bs4.element.ResultSet + """ + return self.css.select(selector, namespaces, limit, **kwargs) + + @property + def css(self): + """Return an interface to the CSS selector API.""" + return CSS(self) # Old names for backwards compatibility def childGenerator(self): + """Deprecated generator.""" return self.children def recursiveChildGenerator(self): + """Deprecated generator.""" return self.descendants def has_key(self, key): - """This was kind of misleading because has_key() (attributes) - was different from __in__ (contents). has_key() is gone in - Python 3, anyway.""" - warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( - key)) + """Deprecated method. This was kind of misleading because has_key() + (attributes) was different from __in__ (contents). + + has_key() is gone in Python 3, anyway. + """ + warnings.warn( + 'has_key is deprecated. Use has_attr(key) instead.', + DeprecationWarning, stacklevel=2 + ) return self.has_attr(key) # Next, a couple classes to represent queries and their results. class SoupStrainer(object): """Encapsulates a number of ways of matching a markup element (tag or - text).""" + string). + + This is primarily used to underpin the find_* methods, but you can + create one yourself and pass it in as `parse_only` to the + `BeautifulSoup` constructor, to parse a subset of a large + document. + """ + + def __init__(self, name=None, attrs={}, string=None, **kwargs): + """Constructor. + + The SoupStrainer constructor takes the same arguments passed + into the find_* methods. See the online documentation for + detailed explanations. + + :param name: A filter on tag name. + :param attrs: A dictionary of filters on attribute values. + :param string: A filter for a NavigableString with specific text. + :kwargs: A dictionary of filters on attribute values. + """ + if string is None and 'text' in kwargs: + string = kwargs.pop('text') + warnings.warn( + "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", + DeprecationWarning, stacklevel=2 + ) - def __init__(self, name=None, attrs={}, text=None, **kwargs): self.name = self._normalize_search_value(name) if not isinstance(attrs, dict): # Treat a non-dict value for attrs as a search for the 'class' @@ -1556,12 +2195,15 @@ class SoupStrainer(object): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs - self.text = self._normalize_search_value(text) + self.string = self._normalize_search_value(string) + + # DEPRECATED but just in case someone is checking this. + self.text = self.string def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1589,19 +2231,40 @@ class SoupStrainer(object): return str(str(value)) def __str__(self): - if self.text: - return self.text + """A human-readable representation of this SoupStrainer.""" + if self.string: + return self.string else: return "%s|%s" % (self.name, self.attrs) def search_tag(self, markup_name=None, markup_attrs={}): + """Check whether a Tag with the given name and attributes would + match this SoupStrainer. + + Used prospectively to decide whether to even bother creating a Tag + object. + + :param markup_name: A tag name as found in some markup. + :param markup_attrs: A dictionary of attributes as found in some markup. + + :return: True if the prospective tag would match this SoupStrainer; + False otherwise. + """ found = None markup = None if isinstance(markup_name, Tag): markup = markup_name markup_attrs = markup + + if isinstance(self.name, str): + # Optimization for a very common case where the user is + # searching for a tag with one specific name, and we're + # looking at a tag with a different name. + if markup and not markup.prefix and self.name != markup.name: + return False + call_function_with_tag_data = ( - isinstance(self.name, collections.abc.Callable) + isinstance(self.name, Callable) and not isinstance(markup_name, Tag)) if ((not self.name) @@ -1630,13 +2293,22 @@ class SoupStrainer(object): found = markup else: found = markup_name - if found and self.text and not self._matches(found.string, self.text): + if found and self.string and not self._matches(found.string, self.string): found = None return found + + # For BS3 compatibility. searchTag = search_tag def search(self, markup): - # print 'looking for %s in %s' % (self, markup) + """Find all items in `markup` that match this SoupStrainer. + + Used by the core _find_all() method, which is ultimately + called by all find_* methods. + + :param markup: A PageElement or a list of them. + """ + # print('looking for %s in %s' % (self, markup)) found = None # If given a list of items, scan it for a text element that # matches. @@ -1649,49 +2321,44 @@ class SoupStrainer(object): # If it's a Tag, make sure its name or attributes match. # Don't bother with Tags if we're searching for text. elif isinstance(markup, Tag): - if not self.text or self.name or self.attrs: + if not self.string or self.name or self.attrs: found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ isinstance(markup, str): - if not self.name and not self.attrs and self._matches(markup, self.text): + if not self.name and not self.attrs and self._matches(markup, self.string): found = markup else: raise Exception( "I don't know how to match against a %s" % markup.__class__) return found - def _matches(self, markup, match_against): - # print u"Matching %s against %s" % (markup, match_against) + def _matches(self, markup, match_against, already_tried=None): + # print(u"Matching %s against %s" % (markup, match_against)) result = False if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. - if (isinstance(match_against, str) - and ' ' in match_against): - # A bit of a special case. If they try to match "foo - # bar" on a multivalue attribute's value, only accept - # the literal value "foo bar" - # - # XXX This is going to be pretty slow because we keep - # splitting match_against. But it shouldn't come up - # too often. - return (whitespace_re.split(match_against) == markup) - else: - for item in markup: - if self._matches(item, match_against): - return True - return False + for item in markup: + if self._matches(item, match_against): + return True + # We didn't match any particular value of the multivalue + # attribute, but maybe we match the attribute value when + # considered as a string. + if self._matches(' '.join(markup), match_against): + return True + return False if match_against is True: # True matches any non-None value. return markup is not None - if isinstance(match_against, collections.abc.Callable): + if isinstance(match_against, Callable): return match_against(markup) # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. + original_markup = markup if isinstance(markup, Tag): markup = markup.name @@ -1702,23 +2369,67 @@ class SoupStrainer(object): # None matches None, False, an empty string, an empty list, and so on. return not match_against - if isinstance(match_against, str): + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, str)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, str): # Exact string match - return markup == match_against + match = markup == match_against - if hasattr(match_against, 'match'): + if not match and hasattr(match_against, 'search'): # Regexp match return match_against.search(markup) - if hasattr(match_against, '__iter__'): - # The markup must be an exact match against something - # in the iterable. - return markup in match_against + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match class ResultSet(list): """A ResultSet is just a list that keeps track of the SoupStrainer that created it.""" def __init__(self, source, result=()): + """Constructor. + + :param source: A SoupStrainer. + :param result: A list of PageElements. + """ super(ResultSet, self).__init__(result) self.source = source + + def __getattr__(self, key): + """Raise a helpful exception to explain a common code fix.""" + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key + ) |