diff options
Diffstat (limited to 'bitbake/lib/bs4/builder/_html5lib.py')
-rw-r--r-- | bitbake/lib/bs4/builder/_html5lib.py | 250 |
1 files changed, 196 insertions, 54 deletions
diff --git a/bitbake/lib/bs4/builder/_html5lib.py b/bitbake/lib/bs4/builder/_html5lib.py index 2b7a70aa11..7c46a85118 100644 --- a/bitbake/lib/bs4/builder/_html5lib.py +++ b/bitbake/lib/bs4/builder/_html5lib.py @@ -1,10 +1,14 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'HTML5TreeBuilder', ] -from pdb import set_trace import warnings +import re from bs4.builder import ( + DetectsXMLParsedAsHTML, PERMISSIVE, HTML, HTML_5, @@ -12,17 +16,13 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib -try: - # html5lib >= 0.99999999/1.0b9 - from html5lib.treebuilders import base as treebuildersbase -except ImportError: - # html5lib <= 0.9999999/1.0b8 - from html5lib.treebuilders import _base as treebuildersbase -from html5lib.constants import namespaces - +from html5lib.constants import ( + namespaces, + prefixes, + ) from bs4.element import ( Comment, Doctype, @@ -30,13 +30,37 @@ from bs4.element import ( Tag, ) +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError as e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree.""" + """Use html5lib to build a tree. + + Note that this TreeBuilder does not support some features common + to HTML TreeBuilders. Some of these features could theoretically + be implemented, but at the very least it's quite difficult, + because html5lib moves the parse tree around as it's being built. + + * This TreeBuilder doesn't use different subclasses of NavigableString + based on the name of the tag in which the string was found. + + * You can't use a SoupStrainer to parse only part of a document. + """ NAME = "html5lib" features = [NAME, PERMISSIVE, HTML_5, HTML] + # html5lib can tell us which line number and position in the + # original file is the source of an element. + TRACKS_LINE_NUMBERS = True + def prepare_markup(self, markup, user_specified_encoding, document_declared_encoding=None, exclude_encodings=None): # Store the user-specified encoding for use later on. @@ -46,27 +70,56 @@ class HTML5TreeBuilder(HTMLTreeBuilder): # ATM because the html5lib TreeBuilder doesn't use # UnicodeDammit. if exclude_encodings: - warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + warnings.warn( + "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.", + stacklevel=3 + ) + + # html5lib only parses HTML, so if it's given XML that's worth + # noting. + DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml( + markup, stacklevel=3 + ) + yield (markup, None, None, False) # These methods are defined by Beautiful Soup. def feed(self, markup): if self.soup.parse_only is not None: - warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + warnings.warn( + "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.", + stacklevel=4 + ) parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) - + self.underlying_builder.parser = parser + extra_kwargs = dict() + if not isinstance(markup, str): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + # Set the character encoding detected by the tokenizer. if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] - + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, str): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding + self.underlying_builder.parser = None + def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) + namespaceHTMLElements, self.soup, + store_line_numbers=self.store_line_numbers + ) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -74,12 +127,30 @@ class HTML5TreeBuilder(HTMLTreeBuilder): return '<html><head></head><body>%s</body></html>' % fragment -class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): - - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None, + store_line_numbers=True, **kwargs): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup( + "", "html.parser", store_line_numbers=store_line_numbers, + **kwargs + ) + # TODO: What are **kwargs exactly? Should they be passed in + # here in addition to/instead of being passed to the BeautifulSoup + # constructor? super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + # This will be set later to an html5lib.html5parser.HTMLParser + # object, which we can use to track the current line number. + self.parser = None + self.store_line_numbers = store_line_numbers + def documentClass(self): self.soup.reset() return Element(self.soup, self.soup, None) @@ -93,14 +164,26 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): - tag = self.soup.new_tag(name, namespace) + kwargs = {} + if self.parser and self.store_line_numbers: + # This represents the point immediately after the end of the + # tag. We don't know when the tag started, but we do know + # where it ended -- the character just before this one. + sourceline, sourcepos = self.parser.tokenizer.stream.position() + kwargs['sourceline'] = sourceline + kwargs['sourcepos'] = sourcepos-1 + tag = self.soup.new_tag(name, namespace, **kwargs) + return Element(tag, self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): - self.soup = BeautifulSoup("") + from bs4 import BeautifulSoup + # TODO: Why is the parser 'html.parser' here? To avoid an + # infinite loop? + self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) @@ -112,7 +195,57 @@ class TreeBuilderForHtml5lib(treebuildersbase.TreeBuilder): return self.soup def getFragment(self): - return treebuildersbase.TreeBuilder.getFragment(self).element + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) + else: + rv.append("|%s<!DOCTYPE >" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s<!-- %s -->" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) class AttrList(object): def __init__(self, element): @@ -123,14 +256,14 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes - if (name in list_attr['*'] + list_attr = self.element.cdata_list_attributes or {} + if (name in list_attr.get('*', []) or (self.element.name in list_attr - and name in list_attr[self.element.name])): + and name in list_attr.get(self.element.name, []))): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -144,9 +277,9 @@ class AttrList(object): return name in list(self.attrs.keys()) -class Element(treebuildersbase.Node): +class Element(treebuilder_base.Node): def __init__(self, element, soup, namespace): - treebuildersbase.Node.__init__(self, element.name) + treebuilder_base.Node.__init__(self, element.name) self.element = element self.soup = soup self.namespace = namespace @@ -165,13 +298,15 @@ class Element(treebuildersbase.Node): child = node elif node.element.__class__ == NavigableString: string_child = child = node.element + node.parent = self else: child = node.element + node.parent = self if not isinstance(child, str) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -204,12 +339,12 @@ class Element(treebuildersbase.Node): most_recent_element=most_recent_element) def getAttributes(self): + if isinstance(self.element, Comment): + return {} return AttrList(self.element) def setAttributes(self, attributes): - if attributes is not None and len(attributes) > 0: - converted_attributes = [] for name, value in list(attributes.items()): if isinstance(name, tuple): @@ -231,11 +366,11 @@ class Element(treebuildersbase.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - text = TextNode(self.soup.new_string(data), self.soup) - self.insertBefore(data, insertBefore) + self.insertBefore(text, insertBefore) else: - self.appendChild(data) + self.appendChild(text) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) @@ -254,9 +389,10 @@ class Element(treebuildersbase.Node): def reparentChildren(self, new_parent): """Move all of this tag's children into another tag.""" - # print "MOVE", self.element.contents - # print "FROM", self.element - # print "TO", new_parent.element + # print("MOVE", self.element.contents) + # print("FROM", self.element) + # print("TO", new_parent.element) + element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -275,29 +411,35 @@ class Element(treebuildersbase.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - if new_parents_last_descendant: + if new_parents_last_descendant is not None: first_child.previous_element = new_parents_last_descendant else: first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant: + if new_parents_last_descendant is not None: new_parents_last_descendant.next_element = first_child else: new_parent_element.next_element = first_child - if new_parents_last_child: + if new_parents_last_child is not None: new_parents_last_child.next_sibling = first_child - # Fix the last child's next_element and next_sibling - last_child = to_append[-1] - last_child.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element: - new_parents_last_descendant_next_element.previous_element = last_child - last_child.next_sibling = None + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None for child in to_append: child.parent = new_parent_element @@ -307,9 +449,9 @@ class Element(treebuildersbase.Node): element.contents = [] element.next_element = final_next_element - # print "DONE WITH MOVE" - # print "FROM", self.element - # print "TO", new_parent_element + # print("DONE WITH MOVE") + # print("FROM", self.element) + # print("TO", new_parent_element) def cloneNode(self): tag = self.soup.new_tag(self.element.name, self.namespace) @@ -331,7 +473,7 @@ class Element(treebuildersbase.Node): class TextNode(Element): def __init__(self, element, soup): - treebuildersbase.Node.__init__(self, None) + treebuilder_base.Node.__init__(self, None) self.element = element self.soup = soup |