diff options
Diffstat (limited to 'bitbake/lib/bs4/dammit.py')
-rw-r--r-- | bitbake/lib/bs4/dammit.py | 409 |
1 files changed, 332 insertions, 77 deletions
diff --git a/bitbake/lib/bs4/dammit.py b/bitbake/lib/bs4/dammit.py index 68d419feb5..692433c57a 100644 --- a/bitbake/lib/bs4/dammit.py +++ b/bitbake/lib/bs4/dammit.py @@ -6,69 +6,185 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ +# Use of this source code is governed by the MIT license. __license__ = "MIT" -from pdb import set_trace -import codecs from html.entities import codepoint2name +from collections import defaultdict +import codecs import re import logging import string -# Import a library to autodetect character encodings. -chardet_type = None +# Import a library to autodetect character encodings. We'll support +# any of a number of libraries that all support the same API: +# +# * cchardet +# * chardet +# * charset-normalizer +chardet_module = None try: - # First try the fast C implementation. # PyPI package: cchardet - import cchardet - def chardet_dammit(s): - return cchardet.detect(s)['encoding'] + import cchardet as chardet_module except ImportError: try: - # Fall back to the pure Python implementation # Debian package: python-chardet # PyPI package: chardet - import chardet - def chardet_dammit(s): - return chardet.detect(s)['encoding'] - #import chardet.constants - #chardet.constants._debug = 1 + import chardet as chardet_module except ImportError: - # No chardet available. - def chardet_dammit(s): - return None + try: + # PyPI package: charset-normalizer + import charset_normalizer as chardet_module + except ImportError: + # No chardet available. + chardet_module = None -# Available from http://cjkpython.i18n.org/. -try: - import iconv_codec -except ImportError: - pass +if chardet_module: + def chardet_dammit(s): + if isinstance(s, str): + return None + return chardet_module.detect(s)['encoding'] +else: + def chardet_dammit(s): + return None -xml_encoding_re = re.compile( - '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) -html_meta_re = re.compile( - '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) +# Build bytestring and Unicode versions of regular expressions for finding +# a declared encoding inside an XML or HTML document. +xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' +html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' +encoding_res = dict() +encoding_res[bytes] = { + 'html' : re.compile(html_meta.encode("ascii"), re.I), + 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), +} +encoding_res[str] = { + 'html' : re.compile(html_meta, re.I), + 'xml' : re.compile(xml_encoding, re.I) +} + +from html.entities import html5 class EntitySubstitution(object): - - """Substitute XML or HTML entities for the corresponding characters.""" + """The ability to substitute XML or HTML entities for certain characters.""" def _populate_class_variables(): - lookup = {} - reverse_lookup = {} - characters_for_re = [] + """Initialize variables used by this class to manage the plethora of + HTML5 named entities. + + This function returns a 3-tuple containing two dictionaries + and a regular expression: + + unicode_to_name - A mapping of Unicode strings like "⦨" to + entity names like "angmsdaa". When a single Unicode string has + multiple entity names, we try to choose the most commonly-used + name. + + name_to_unicode: A mapping of entity names like "angmsdaa" to + Unicode strings like "⦨". + + named_entity_re: A regular expression matching (almost) any + Unicode string that corresponds to an HTML5 named entity. + """ + unicode_to_name = {} + name_to_unicode = {} + + short_entities = set() + long_entities_by_first_character = defaultdict(set) + + for name_with_semicolon, character in sorted(html5.items()): + # "It is intentional, for legacy compatibility, that many + # code points have multiple character reference names. For + # example, some appear both with and without the trailing + # semicolon, or with different capitalizations." + # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references + # + # The parsers are in charge of handling (or not) character + # references with no trailing semicolon, so we remove the + # semicolon whenever it appears. + if name_with_semicolon.endswith(';'): + name = name_with_semicolon[:-1] + else: + name = name_with_semicolon + + # When parsing HTML, we want to recognize any known named + # entity and convert it to a sequence of Unicode + # characters. + if name not in name_to_unicode: + name_to_unicode[name] = character + + # When _generating_ HTML, we want to recognize special + # character sequences that _could_ be converted to named + # entities. + unicode_to_name[character] = name + + # We also need to build a regular expression that lets us + # _find_ those characters in output strings so we can + # replace them. + # + # This is tricky, for two reasons. + + if (len(character) == 1 and ord(character) < 128 + and character not in '<>&'): + # First, it would be annoying to turn single ASCII + # characters like | into named entities like + # |. The exceptions are <>&, which we _must_ + # turn into named entities to produce valid HTML. + continue + + if len(character) > 1 and all(ord(x) < 128 for x in character): + # We also do not want to turn _combinations_ of ASCII + # characters like 'fj' into named entities like 'fj', + # though that's more debateable. + continue + + # Second, some named entities have a Unicode value that's + # a subset of the Unicode value for some _other_ named + # entity. As an example, \u2267' is ≧, + # but '\u2267\u0338' is ≧̸. Our regular + # expression needs to match the first two characters of + # "\u2267\u0338foo", but only the first character of + # "\u2267foo". + # + # In this step, we build two sets of characters that + # _eventually_ need to go into the regular expression. But + # we won't know exactly what the regular expression needs + # to look like until we've gone through the entire list of + # named entities. + if len(character) == 1: + short_entities.add(character) + else: + long_entities_by_first_character[character[0]].add(character) + + # Now that we've been through the entire list of entities, we + # can create a regular expression that matches any of them. + particles = set() + for short in short_entities: + long_versions = long_entities_by_first_character[short] + if not long_versions: + particles.add(short) + else: + ignore = "".join([x[1] for x in long_versions]) + # This finds, e.g. \u2267 but only if it is _not_ + # followed by \u0338. + particles.add("%s(?![%s])" % (short, ignore)) + + for long_entities in list(long_entities_by_first_character.values()): + for long_entity in long_entities: + particles.add(long_entity) + + re_definition = "(%s)" % "|".join(particles) + + # If an entity shows up in both html5 and codepoint2name, it's + # likely that HTML5 gives it several different names, such as + # 'rsquo' and 'rsquor'. When converting Unicode characters to + # named entities, the codepoint2name name should take + # precedence where possible, since that's the more easily + # recognizable one. for codepoint, name in list(codepoint2name.items()): character = chr(codepoint) - if codepoint != 34: - # There's no point in turning the quotation mark into - # ", unless it happens within an attribute value, which - # is handled elsewhere. - characters_for_re.append(character) - lookup[character] = name - # But we do want to turn " into the quotation mark. - reverse_lookup[name] = character - re_definition = "[%s]" % "".join(characters_for_re) - return lookup, reverse_lookup, re.compile(re_definition) + unicode_to_name[character] = name + + return unicode_to_name, name_to_unicode, re.compile(re_definition) (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() @@ -81,20 +197,22 @@ class EntitySubstitution(object): } BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" ")") AMPERSAND_OR_BRACKET = re.compile("([<>&])") @classmethod def _substitute_html_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate HTML entity for a special character string.""" entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) return "&%s;" % entity @classmethod def _substitute_xml_entity(cls, matchobj): """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" + appropriate XML entity for a special character string.""" entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] return "&%s;" % entity @@ -189,6 +307,8 @@ class EntitySubstitution(object): containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that character with "é" will make it more readable to some people. + + :param s: A Unicode string. """ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( cls._substitute_html_entity, s) @@ -200,23 +320,65 @@ class EncodingDetector: Order of precedence: 1. Encodings you specifically tell EncodingDetector to try first - (the override_encodings argument to the constructor). + (the known_definite_encodings argument to the constructor). + + 2. An encoding determined by sniffing the document's byte-order mark. - 2. An encoding declared within the bytestring itself, either in an + 3. Encodings you specifically tell EncodingDetector to try if + byte-order mark sniffing fails (the user_encodings argument to the + constructor). + + 4. An encoding declared within the bytestring itself, either in an XML declaration (if the bytestring is to be interpreted as an XML document), or in a <meta> tag (if the bytestring is to be interpreted as an HTML document.) - 3. An encoding detected through textual analysis by chardet, + 5. An encoding detected through textual analysis by chardet, cchardet, or a similar external library. 4. UTF-8. 5. Windows-1252. + """ - def __init__(self, markup, override_encodings=None, is_html=False, - exclude_encodings=None): - self.override_encodings = override_encodings or [] + def __init__(self, markup, known_definite_encodings=None, + is_html=False, exclude_encodings=None, + user_encodings=None, override_encodings=None): + """Constructor. + + :param markup: Some markup in an unknown encoding. + + :param known_definite_encodings: When determining the encoding + of `markup`, these encodings will be tried first, in + order. In HTML terms, this corresponds to the "known + definite encoding" step defined here: + https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding + + :param user_encodings: These encodings will be tried after the + `known_definite_encodings` have been tried and failed, and + after an attempt to sniff the encoding by looking at a + byte order mark has failed. In HTML terms, this + corresponds to the step "user has explicitly instructed + the user agent to override the document's character + encoding", defined here: + https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding + + :param override_encodings: A deprecated alias for + known_definite_encodings. Any encodings here will be tried + immediately after the encodings in + known_definite_encodings. + + :param is_html: If True, this markup is considered to be + HTML. Otherwise it's assumed to be XML. + + :param exclude_encodings: These encodings will not be tried, + even if they otherwise would be. + + """ + self.known_definite_encodings = list(known_definite_encodings or []) + if override_encodings: + self.known_definite_encodings += override_encodings + self.user_encodings = user_encodings or [] exclude_encodings = exclude_encodings or [] self.exclude_encodings = set([x.lower() for x in exclude_encodings]) self.chardet_encoding = None @@ -227,6 +389,12 @@ class EncodingDetector: self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) def _usable(self, encoding, tried): + """Should we even bother to try this encoding? + + :param encoding: Name of an encoding. + :param tried: Encodings that have already been tried. This will be modified + as a side effect. + """ if encoding is not None: encoding = encoding.lower() if encoding in self.exclude_encodings: @@ -238,9 +406,14 @@ class EncodingDetector: @property def encodings(self): - """Yield a number of encodings that might work for this markup.""" + """Yield a number of encodings that might work for this markup. + + :yield: A sequence of strings. + """ tried = set() - for e in self.override_encodings: + + # First, try the known definite encodings + for e in self.known_definite_encodings: if self._usable(e, tried): yield e @@ -249,6 +422,12 @@ class EncodingDetector: if self._usable(self.sniffed_encoding, tried): yield self.sniffed_encoding + # Sniffing the byte-order mark did nothing; try the user + # encodings. + for e in self.user_encodings: + if self._usable(e, tried): + yield e + # Look within the document for an XML or HTML encoding # declaration. if self.declared_encoding is None: @@ -271,7 +450,11 @@ class EncodingDetector: @classmethod def strip_byte_order_mark(cls, data): - """If a byte-order mark is present, strip it and return the encoding it implies.""" + """If a byte-order mark is present, strip it and return the encoding it implies. + + :param data: Some markup. + :return: A 2-tuple (modified data, implied encoding) + """ encoding = None if isinstance(data, str): # Unicode data cannot have a byte-order mark. @@ -303,21 +486,36 @@ class EncodingDetector: An HTML encoding is declared in a <meta> tag, hopefully near the beginning of the document. + + :param markup: Some markup. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param search_entire_document: Since an encoding is supposed to declared near the beginning + of the document, most of the time it's only necessary to search a few kilobytes of data. + Set this to True to force this method to search the entire document. """ if search_entire_document: xml_endpos = html_endpos = len(markup) else: xml_endpos = 1024 html_endpos = max(2048, int(len(markup) * 0.05)) - + + if isinstance(markup, bytes): + res = encoding_res[bytes] + else: + res = encoding_res[str] + + xml_re = res['xml'] + html_re = res['html'] declared_encoding = None - declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) + declared_encoding_match = xml_re.search(markup, endpos=xml_endpos) if not declared_encoding_match and is_html: - declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) + declared_encoding_match = html_re.search(markup, endpos=html_endpos) if declared_encoding_match is not None: - declared_encoding = declared_encoding_match.groups()[0].decode( - 'ascii', 'replace') + declared_encoding = declared_encoding_match.groups()[0] if declared_encoding: + if isinstance(declared_encoding, bytes): + declared_encoding = declared_encoding.decode('ascii', 'replace') return declared_encoding.lower() return None @@ -340,15 +538,53 @@ class UnicodeDammit: "iso-8859-2", ] - def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False, exclude_encodings=[]): + def __init__(self, markup, known_definite_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[], + user_encodings=None, override_encodings=None + ): + """Constructor. + + :param markup: A bytestring representing markup in an unknown encoding. + + :param known_definite_encodings: When determining the encoding + of `markup`, these encodings will be tried first, in + order. In HTML terms, this corresponds to the "known + definite encoding" step defined here: + https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding + + :param user_encodings: These encodings will be tried after the + `known_definite_encodings` have been tried and failed, and + after an attempt to sniff the encoding by looking at a + byte order mark has failed. In HTML terms, this + corresponds to the step "user has explicitly instructed + the user agent to override the document's character + encoding", defined here: + https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding + + :param override_encodings: A deprecated alias for + known_definite_encodings. Any encodings here will be tried + immediately after the encodings in + known_definite_encodings. + + :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted + to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. + Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' + will convert them to HTML entity references. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param exclude_encodings: These encodings will not be considered, even + if the sniffing code thinks they might make sense. + + """ self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False self.is_html = is_html - + self.log = logging.getLogger(__name__) self.detector = EncodingDetector( - markup, override_encodings, is_html, exclude_encodings) + markup, known_definite_encodings, is_html, exclude_encodings, + user_encodings, override_encodings + ) # Short-circuit if the data is in Unicode to begin with. if isinstance(markup, str) or markup == '': @@ -376,9 +612,10 @@ class UnicodeDammit: if encoding != "ascii": u = self._convert_from(encoding, "replace") if u is not None: - logging.warning( + self.log.warning( "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.") + "replaced with REPLACEMENT CHARACTER." + ) self.contains_replacement_characters = True break @@ -407,6 +644,10 @@ class UnicodeDammit: return sub def _convert_from(self, proposed, errors="strict"): + """Attempt to convert the markup to the proposed encoding. + + :param proposed: The name of a character encoding. + """ proposed = self.find_codec(proposed) if not proposed or (proposed, errors) in self.tried_encodings: return None @@ -421,30 +662,40 @@ class UnicodeDammit: markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) try: - #print "Trying to convert document to %s (errors=%s)" % ( - # proposed, errors) + #print("Trying to convert document to %s (errors=%s)" % ( + # proposed, errors)) u = self._to_unicode(markup, proposed, errors) self.markup = u self.original_encoding = proposed except Exception as e: - #print "That didn't work!" - #print e + #print("That didn't work!") + #print(e) return None - #print "Correct encoding: %s" % proposed + #print("Correct encoding: %s" % proposed) return self.markup def _to_unicode(self, data, encoding, errors="strict"): - '''Given a string and its encoding, decodes the string into Unicode. - %encoding is a string recognized by encodings.aliases''' + """Given a string and its encoding, decodes the string into Unicode. + + :param encoding: The name of an encoding. + """ return str(data, encoding, errors) @property def declared_html_encoding(self): + """If the markup is an HTML document, returns the encoding declared _within_ + the document. + """ if not self.is_html: return None return self.detector.declared_encoding def find_codec(self, charset): + """Convert the name of a character set to a codec name. + + :param charset: The name of a character set. + :return: The name of a codec. + """ value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) or (charset and self._codec(charset.replace("-", ""))) or (charset and self._codec(charset.replace("-", "_"))) @@ -734,7 +985,7 @@ class UnicodeDammit: 0xde : b'\xc3\x9e', # Þ 0xdf : b'\xc3\x9f', # ß 0xe0 : b'\xc3\xa0', # à - 0xe1 : b'\xa1', # á + 0xe1 : b'\xa1', # á 0xe2 : b'\xc3\xa2', # â 0xe3 : b'\xc3\xa3', # ã 0xe4 : b'\xc3\xa4', # ä @@ -783,12 +1034,16 @@ class UnicodeDammit: Currently the only situation supported is Windows-1252 (or its subset ISO-8859-1), embedded in UTF-8. - The input must be a bytestring. If you've already converted - the document to Unicode, you're too late. - - The output is a bytestring in which `embedded_encoding` - characters have been converted to their `main_encoding` - equivalents. + :param in_bytes: A bytestring that you suspect contains + characters from multiple encodings. Note that this _must_ + be a bytestring. If you've already converted the document + to Unicode, you're too late. + :param main_encoding: The primary encoding of `in_bytes`. + :param embedded_encoding: The encoding that was used to embed characters + in the main document. + :return: A bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. """ if embedded_encoding.replace('_', '-').lower() not in ( 'windows-1252', 'windows_1252'): |