From efec3a4dcf8803861ff0e49921f4c19de2b19282 Mon Sep 17 00:00:00 2001 From: user Date: Sat, 7 Dec 2019 14:35:21 -0500 Subject: [PATCH] includes bs4 in the repository now --- bs4/__init__.py | 616 +++++ bs4/__init__.py.bak | 616 +++++ bs4/__pycache__/__init__.cpython-36.pyc | Bin 0 -> 16895 bytes bs4/__pycache__/dammit.cpython-36.pyc | Bin 0 -> 18774 bytes bs4/__pycache__/element.cpython-36.pyc | Bin 0 -> 41980 bytes bs4/__pycache__/formatter.cpython-36.pyc | Bin 0 -> 2996 bytes bs4/__pycache__/testing.cpython-36.pyc | Bin 0 -> 38175 bytes bs4/builder/__init__.py | 367 +++ bs4/builder/__init__.py.bak | 367 +++ .../__pycache__/__init__.cpython-36.pyc | Bin 0 -> 10084 bytes .../__pycache__/_html5lib.cpython-36.pyc | Bin 0 -> 11599 bytes .../__pycache__/_htmlparser.cpython-36.pyc | Bin 0 -> 8958 bytes bs4/builder/__pycache__/_lxml.cpython-36.pyc | Bin 0 -> 8029 bytes bs4/builder/_html5lib.py | 426 ++++ bs4/builder/_html5lib.py.bak | 426 ++++ bs4/builder/_htmlparser.py | 350 +++ bs4/builder/_htmlparser.py.bak | 350 +++ bs4/builder/_lxml.py | 296 +++ bs4/builder/_lxml.py.bak | 296 +++ bs4/dammit.py | 850 +++++++ bs4/dammit.py.bak | 850 +++++++ bs4/diagnose.py | 224 ++ bs4/diagnose.py.bak | 224 ++ bs4/element.py | 1579 ++++++++++++ bs4/element.py.bak | 1579 ++++++++++++ bs4/formatter.py | 99 + bs4/formatter.py.bak | 99 + bs4/testing.py | 992 ++++++++ bs4/testing.py.bak | 992 ++++++++ bs4/tests/__init__.py | 1 + bs4/tests/__pycache__/__init__.cpython-36.pyc | Bin 0 -> 227 bytes .../test_builder_registry.cpython-36.pyc | Bin 0 -> 5049 bytes .../__pycache__/test_docs.cpython-36.pyc | Bin 0 -> 451 bytes .../test_htmlparser.cpython-36.pyc | Bin 0 -> 2493 bytes .../__pycache__/test_tree.cpython-36.pyc | Bin 0 -> 91500 bytes bs4/tests/test_builder_registry.py | 147 ++ bs4/tests/test_docs.py | 36 + bs4/tests/test_html5lib.py | 170 ++ bs4/tests/test_html5lib.py.bak | 170 ++ bs4/tests/test_htmlparser.py | 47 + bs4/tests/test_lxml.py | 100 + bs4/tests/test_lxml.py.bak | 100 + 
bs4/tests/test_soup.py | 567 +++++ bs4/tests/test_soup.py.bak | 567 +++++ bs4/tests/test_tree.py | 2205 +++++++++++++++++ bs4/tests/test_tree.py.bak | 2205 +++++++++++++++++ 46 files changed, 17913 insertions(+) create mode 100644 bs4/__init__.py create mode 100644 bs4/__init__.py.bak create mode 100644 bs4/__pycache__/__init__.cpython-36.pyc create mode 100644 bs4/__pycache__/dammit.cpython-36.pyc create mode 100644 bs4/__pycache__/element.cpython-36.pyc create mode 100644 bs4/__pycache__/formatter.cpython-36.pyc create mode 100644 bs4/__pycache__/testing.cpython-36.pyc create mode 100644 bs4/builder/__init__.py create mode 100644 bs4/builder/__init__.py.bak create mode 100644 bs4/builder/__pycache__/__init__.cpython-36.pyc create mode 100644 bs4/builder/__pycache__/_html5lib.cpython-36.pyc create mode 100644 bs4/builder/__pycache__/_htmlparser.cpython-36.pyc create mode 100644 bs4/builder/__pycache__/_lxml.cpython-36.pyc create mode 100644 bs4/builder/_html5lib.py create mode 100644 bs4/builder/_html5lib.py.bak create mode 100644 bs4/builder/_htmlparser.py create mode 100644 bs4/builder/_htmlparser.py.bak create mode 100644 bs4/builder/_lxml.py create mode 100644 bs4/builder/_lxml.py.bak create mode 100644 bs4/dammit.py create mode 100644 bs4/dammit.py.bak create mode 100644 bs4/diagnose.py create mode 100644 bs4/diagnose.py.bak create mode 100644 bs4/element.py create mode 100644 bs4/element.py.bak create mode 100644 bs4/formatter.py create mode 100644 bs4/formatter.py.bak create mode 100644 bs4/testing.py create mode 100644 bs4/testing.py.bak create mode 100644 bs4/tests/__init__.py create mode 100644 bs4/tests/__pycache__/__init__.cpython-36.pyc create mode 100644 bs4/tests/__pycache__/test_builder_registry.cpython-36.pyc create mode 100644 bs4/tests/__pycache__/test_docs.cpython-36.pyc create mode 100644 bs4/tests/__pycache__/test_htmlparser.cpython-36.pyc create mode 100644 bs4/tests/__pycache__/test_tree.cpython-36.pyc create mode 100644 
bs4/tests/test_builder_registry.py create mode 100644 bs4/tests/test_docs.py create mode 100644 bs4/tests/test_html5lib.py create mode 100644 bs4/tests/test_html5lib.py.bak create mode 100644 bs4/tests/test_htmlparser.py create mode 100644 bs4/tests/test_lxml.py create mode 100644 bs4/tests/test_lxml.py.bak create mode 100644 bs4/tests/test_soup.py create mode 100644 bs4/tests/test_soup.py.bak create mode 100644 bs4/tests/test_tree.py create mode 100644 bs4/tests/test_tree.py.bak diff --git a/bs4/__init__.py b/bs4/__init__.py new file mode 100644 index 0000000..95ca229 --- /dev/null +++ b/bs4/__init__.py @@ -0,0 +1,616 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.8.0" +__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import os +import re +import sys +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. 
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. 
Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + + """ + + if 'convertEntities' in kwargs: + del kwargs['convertEntities'] + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. 
Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, str): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. 
+ if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self._namespaces = dict() + self.parse_only = parse_only + + self.builder.initialize_soup(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, str) and not '<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. 
+ if (isinstance(markup, str) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception as e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + if isinstance(markup, str): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a filename, not markup. You should' + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' % markup) + self._check_markup_is_url(markup) + + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). 
+ """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, str): + space = ' ' + cant_start_with = ("http:", "https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.current_data = [] + self.currentTag = None + self.tagStack = [] + self.preserve_whitespace_tag_stack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): + """Create a new tag associated with this soup.""" + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + return subclass(s) + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + #print "Pop", 
tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag is not None: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + + def endData(self, containerClass=NavigableString): + if self.current_data: + current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? 
+ if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if previous_element is None: + previous_element = o.previous_element + + fix = parent.next_element is not None + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. + child.previous_element = el + child.previous_sibling = None + + # We have no sibling as we've been appended as the last. + child.next_sibling = None + + # This index is a tag, dig deeper for a "last descendant" + if isinstance(child, Tag) and child.contents: + descendant = child._last_descendant(False) + + # As the final step, link last descendant. 
It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. It should have no next sibling. + descendant.next_element = None + descendant.next_sibling = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + descendant.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + # The BeautifulSoup object itself can never be popped. + return + + most_recently_popped = None + + stack_size = len(self.tagStack) + for i in range(stack_size - 1, 0, -1): + t = self.tagStack[i] + if (name == t.name and nsprefix == t.prefix): + if inclusivePop: + most_recently_popped = self.popTag() + break + most_recently_popped = self.popTag() + + return most_recently_popped + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occurred + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. 
+ """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self._most_recent_element) + if tag is None: + return tag + if self._most_recent_element is not None: + self._most_recent_element.next_element = tag + self._most_recent_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = '\n' % encoding_part + else: + prefix = '' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. 
+if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print(soup.prettify()) diff --git a/bs4/__init__.py.bak b/bs4/__init__.py.bak new file mode 100644 index 0000000..9cd01c8 --- /dev/null +++ b/bs4/__init__.py.bak @@ -0,0 +1,616 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.8.0" +__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import os +import re +import sys +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. 
+ + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): + """Constructor. + + :param markup: A string or a file-like object representing + markup to be parsed. + + :param features: Desirable features of the parser to be used. This + may be the name of a specific parser ("lxml", "lxml-xml", + "html.parser", or "html5lib") or it may be the type of markup + to be used ("html", "html5", "xml"). It's recommended that you + name a specific parser, so that Beautiful Soup gives you the + same results across platforms and virtual environments. + + :param builder: A TreeBuilder subclass to instantiate (or + instance to use) instead of looking one up based on + `features`. You only need to use this if you've implemented a + custom TreeBuilder. + + :param parse_only: A SoupStrainer. Only parts of the document + matching the SoupStrainer will be considered. This is useful + when parsing part of a document that would otherwise be too + large to fit into memory. + + :param from_encoding: A string indicating the encoding of the + document to be parsed. 
Pass this in if Beautiful Soup is + guessing wrongly about the document's encoding. + + :param exclude_encodings: A list of strings indicating + encodings known to be wrong. Pass this in if you don't know + the document's encoding but you know Beautiful Soup's guess is + wrong. + + :param kwargs: For backwards compatibility purposes, the + constructor accepts certain keyword arguments used in + Beautiful Soup 3. None of these arguments do anything in + Beautiful Soup 4; they will result in a warning and then be ignored. + + Apart from this, any keyword arguments passed into the BeautifulSoup + constructor are propagated to the TreeBuilder constructor. This + makes it possible to configure a TreeBuilder beyond saying + which one to use. + + """ + + if 'convertEntities' in kwargs: + del kwargs['convertEntities'] + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. 
Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + # We need this information to track whether or not the builder + # was specified well enough that we can omit the 'you need to + # specify a parser' warning. + original_builder = builder + original_features = features + + if isinstance(builder, type): + # A builder class was passed in; it needs to be instantiated. + builder_class = builder + builder = None + elif builder is None: + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + + # At this point either we have a TreeBuilder instance in + # builder, or we have a builder_class that we can instantiate + # with the remaining **kwargs. 
+ if builder is None: + builder = builder_class(**kwargs) + if not original_builder and not ( + original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES + ): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + # This code adapted from warnings.py so that we get the same line + # of code as our warnings.warn() call gets, even if the answer is wrong + # (as it may be in a multithreading situation). + caller = None + try: + caller = sys._getframe(1) + except ValueError: + pass + if caller: + globals = caller.f_globals + line_number = caller.f_lineno + else: + globals = sys.__dict__ + line_number= 1 + filename = globals.get('__file__') + if filename: + fnl = filename.lower() + if fnl.endswith((".pyc", ".pyo")): + filename = filename[:-1] + if filename: + # If there is no filename at all, the user is most likely in a REPL, + # and the warning is not necessary. + values = dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type + ) + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) + else: + if kwargs: + warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self._namespaces = dict() + self.parse_only = parse_only + + self.builder.initialize_soup(self) + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. 
+ if (isinstance(markup, unicode) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception, e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + if isinstance(markup, unicode): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a filename, not markup. You should' + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' % markup) + self._check_markup_is_url(markup) + + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). 
+ """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.current_data = [] + self.currentTag = None + self.tagStack = [] + self.preserve_whitespace_tag_stack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): + """Create a new tag associated with this soup.""" + kwattrs.update(attrs) + return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + return subclass(s) + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + #print 
"Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag is not None: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + + def endData(self, containerClass=NavigableString): + if self.current_data: + current_data = u''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? 
+ if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if previous_element is None: + previous_element = o.previous_element + + fix = parent.next_element is not None + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. + child.previous_element = el + child.previous_sibling = None + + # We have no sibling as we've been appended as the last. + child.next_sibling = None + + # This index is a tag, dig deeper for a "last descendant" + if isinstance(child, Tag) and child.contents: + descendant = child._last_descendant(False) + + # As the final step, link last descendant. 
It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. It should have no next sibling. + descendant.next_element = None + descendant.next_sibling = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + descendant.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + # The BeautifulSoup object itself can never be popped. + return + + most_recently_popped = None + + stack_size = len(self.tagStack) + for i in range(stack_size - 1, 0, -1): + t = self.tagStack[i] + if (name == t.name and nsprefix == t.prefix): + if inclusivePop: + most_recently_popped = self.popTag() + break + most_recently_popped = self.popTag() + + return most_recently_popped + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occurred + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. 
+ """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self._most_recent_element) + if tag is None: + return tag + if self._most_recent_element is not None: + self._most_recent_element.next_element = tag + self._most_recent_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. 
+if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/bs4/__pycache__/__init__.cpython-36.pyc b/bs4/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..843a8423d619919841f7d484f53af4ce46d6a0eb GIT binary patch literal 16895 zcmb7r>vJ5}m0v&Rg~0#>K@faOYC#WTWO7JRv@J7~EQ$mzdMSt|Axj>wJZwxifEmn8 z4{!Gnz+@1|3)yD8k)x{hZk)%ul1f!7`I^dyRP9zJf5Lu?t1F+9PpVX^QgKy2s8Y50 z{m$)fU_dgFfSTL4Z{I%m+;h%7uY3EAsj2ec{Gb2+$!x$aDx=e+#7SH6vJnfe*LQ}L!YZPWOk;Z1w_PYo~cpV_nPXZ^GF=kR+T?MJ*Bw9oiw(f$JNk9xDX zpY<-DM(+C2M&JPhO!n>$>X@(7@>)SRbfOJ6a=PxO z@3bPvcf%bb(sLiR*4@ZIUk-g&H8;-7tKGnBt>HQ7BsdEawI?RK6{yY7*=j`_$zaaq z+P4M3pkH%tM~?2W>PHc#-CA=x+ucqXxLpKAHll9lo1NCG(+V-wFmgK`-@_QU@T?ms zpZNBionYYjkMQ=E+lzdUC~x+HEyrCA2GJohoOeQy3P_hz8Va?pem$11hF31)eXw@; z+8saWxyp0ywVE5ahC$C+?C3W2PUx?%xr0tr3)Fgbn7>kcvvz6t+Kr&Uqgv}5k+ayW zI+rh9x^m(2rPsgXJPE39zkhorp2}EBf%<|AefbNlY@=$$GpmDE$MaP~`RlDPQakb7 zT@j&se%o(?9PhhobI`Av@zjT4bl~|n-EOxPRVU;8jhk-d#>E>!m%e%OV#L;QNtk_15FbyY9Nb z)bRz_@w+N$`VidS`fYGd4Vsk7c;cQPf*$w%D6Viac-3w7d==X(?)vb-kHAW-Ajr}V zUL+^gpeHh>CDpm?fYBi!IlU5l5pyjMK9B_o5ucf3J z^Z+dq_@vBv$A_E(#wypEh#RrMQdoS@mz4tPi6FQA?D>N#S1$Sk&mKEpj)FJPT`m{4 z97xK8YOP6@rGJH_68=wAd;^NvfMPbGm@Oz~sONPHsyR=!43&JvoAAn?no!DR?JBj*H@052M@6UQ? 
zytB9-^Pcmb$92wo!EO#MV_Dae`XIneh5KS3}700Wf1o}!*S z@vxc~SrFd!ejhzS%s`9xKq(pwKE^m+3KV^)1QxLrB3Xzx$cncPEo}l^5@;ZP^9fLb z;Bo-^h!Wp`?u9+B);@Y^_Ptu>9lHI(<^l(FoRaYi<|~^G@`DI{r4P-HPBOgnhr+ zT5I_pOkLpIKG%i6A-a^S$`2#QedM+}Lg!Z!d2djkSHl$G8Z=r&-5KtLk>7O|7hYM^ zV^zb2YNA&M;Q&h#dNAyr155|*1uQ}LJU9opMAK92ZUCZkb)d6dNI3PY27I2XvxQa% zp2V6j^7*KxA^>*$9%@04rsO;oFqa?p-Bn*z(Y_{fyVin#_R1nJT(H#;(+`|=KXO#d)1CI= zod*S>$0+Zv=e20?Fcnv=YiI4;n(ty&`{7#$33@JFqGLuLh$mJvP_=SwEH{gZ*_g+5 z0l!K7!vA2yzc(Knn+0|9F=}rak^HGVWY0uitjq?59;SN4L?((C&`s>IVnUTF` zp+@|U3fx&dvv(&()@~Wk?I?%mzemfYH@j>1eloJX+07EX`fuT>C2#+)k+p8_;!Zya z%Ta!Wxq8RM`=1#Z?ZU1V72Bne^=o5yas)^Tgb8oXJKip{_dH^OKQR;NmU?+Kxn?{z zer@jBqlt>~xuvd*$|JiyNtpdLsaG}&$rHOh1&oH?{BEU}^G@tmfa}!GbTr*Q;+@&(c@+?|UwVxY- zTBZ>_KQcXM&jNLZBeVSiFf)691L~Wg{)?OD=eGL3@L2cxowqPDH;bQJ;lEEg&mP`O z(wg>#JOJMBqZiu??U#hUzX$A>y_Z3SW4+u6rT5D2+|G>jcx7}9?Q@{-%b@S8ps%gw zOk?-BX$%V=fySWC?C$Zucl={b?KwPu=fLQfGuWok;nk5y0%&2h&v726v0axrFOJMn zepK+zeP-V@KDqhFq(&9L*YNuWe&^Ba!XEg#GmTn}wYx^b_lttp?+)_)bNk^>J_2@x z!m@p;amky7EA*QCb1do`2N$}ie+Pe8q1@aq)E=GptIl-?E;O|`G^XoN8(k2C6Rgt7 z-w*xJB?F<~rFMZIB)XEJ6M^$D86Z_jSDl-F*ivE^(za9wCVEx_q7v06AOl6+j3KD@ zs3Z9rtfV$W&D5wYE-RgYK)k#_5gnpCzleif*}809qc*bU{(9f zn6pGGy{H9W%UKLmhJtQ~?~Fd+8vIgN6yFuA)d_-45hqx_LEnM54N3&sA>rr`QW$PX%S8b!(<^M zTjHjQ|FYEr1t^O_kikHFye_0(gRDwgrt2d`K|h29F`hD z?{7Cd1JBQ<5ONM3|KI2ozTx7=Jdj>`i`UGb-;TNuBI!rIEx=r39f?_BGwyd@~tq9^cP{_ymh&>K2 z_XFM#nkqGg^=Z}z5#Y+nzV z2nxxw0bKbzL~chf0j-266>jGDQvaYwk3!s9?;(?R&@^0^B_L5PmpRjYJ5QMfeO)Yf zk#+JVn+2C)Hn%a#AhUxwq3<%A2n=*+dO$sldA*Vv$lx$339*WtBpWJg4Zzo0>u{+t zw}W6-{ha_)3|$7C8HlY7r0Af^44pWIPvUe)1)u*(XdmrRRer@B36LQbp=RU~Ij&B^>U147C`&u2!@RXd60#YJ!ejv)q*elS1+BU}lF zzj`K)WBtAkj`ul$^H4_kTI4&zc>RVB*H_&2aQNG2BRAgj>!9~lNGLfFBr%{Gv3w7w z5y0s}D`W;?_?u^9s5^We#O`C0fl)uTBMo=0P6hS&tVHD9- z{zJGeuo_n#aNM~WNNehmCdqn$l6fn*BdZE}>bqZ>=j%44x1OVnD~fJhYBU%?H5y;$ zz@4U)K{Xeb(j-qjc_0-O7c|ItQWHgTWnXX}5Ti0yb91vpSs4D+A3b5`69YHK(1Qly z2?D|7F|n*&O_=4bq}X7j58K*<#_oWH)!DakTbP9b%oApC9Naue)cu26oKpx4zCa*r 
z{DJBDYhRkz;@luwd-K+COI9fZh7e>$l2gZ@Tv_MEHfp-)I3Y>qG8u?aa>ogfgodX9 zdqUz$+(Q)Td`OE0aa3()E=O?=9U|)Masj6CCv8Be!&pZaIi< zAs6h$xpvU%)pN_&-(RX9xqfG5>E817m8Ayn@5fNs4Jb(rwiCt^4WtFv5Vd3Z{g8S4 zC8YutPpmc8J0M3VgoHIHS-pTV2>%%RA-3^SobLo%*aIseA`L13agI;pGs_<|?q0ul zAK2c%yL99Bt=mgC8y{W2x6DnsV#2-p1fAX<@u*LVnhX2bF$vWf%&UHcYN&->;~};l z;FrZkXi;uDsB@flesu>HJGKJoO*pw5@#%2T?+1vu8UwvsC~Jkx4&#D&@IJ2F@b)p` zrEPk;+}4|GG&lU_W+Pc6Aj?2?pp~#E4Ob^=jtly6d^EuBXA8N}Ob=nJNwye$ufc$Q zzx@SBH@)Z(`e@ zXzIRZbd8Ik7FS1X`syTJsG1s~Q>ar=v?2K+?!y0y!l-FOQONy&uK7y2V3sWYrp>aM z%je9TRkm_4GiCf2%#%sYGA(O5&$bfG(6j}(7JFOfv^j%5{3o28yv05hvtZ*b{O9mw z9?!}gNB#>|#j2p^v^j6i7udIgF$rzPoXyPw7N6oCTtS!mofHm0ZCb&cB|&Tg|DKB;9u-9?P+?ZTrCet@XzdBw%H@Ifgk)1dwH0LQoHab z)QHFdMWC8fbYj$dETr{)wp!{3xWu_8_v*_kIx^6rhQ-n3(eM~tWQ2g;@# z5&8r7`dJIFerY;i=B4q=f^J#PNK1JpS>M1fq^ymMPiA&OoHkg$XZ{d&`JpAAv$>pj z9njD;6!3hl+9miX3VY|<&>Ylu`;z+Q#I<$5hs{Ct_K(SQ4wN&7-#ok)@O%@kX z#5SVexBwG{-Nb!=151(QuII(zB87nCM|yw1LEhjPhP>b<=1SmNy&YH)kd28OV{;ER;6mGhB0Q z-9C*P78=Uprme2xT4MXd1#Da}VuRdGXyv#JaM+Eeu@%c&gwMhg1aUbz7l0T$&Pm)g zb?<`@RvIhUzbD=jgf!^mAt*r2dPqa5574$jT9i>SV5ew@X3XKSL-XdyV-w8`!T0f= z-u*%d3LT1{y_dtOKNx`q_Hdw8^PJQ=o;C{5twy6L^^%km=uy_YT}hBC;X-v%9R*}vv%Hr0aW z?I!B+gm{gdE+X(yYp6Ic2VUw$#NRyma!3l|B13f$XT&Mf+CUP_;qh@9NcuFXGWo|V zHJgY#tuZDAwMnrZetT?6hmN%%2zu7^^kTKO6JWx1Pj_4nV_YUSI_v#2B27dLYk2ZW zly#>pjlzx@))m}DkW}tWrzw&vV;~&8)rV73k`+(fbVN=#EgCgeedOr;#VVtwV~AaD zr$>-{Poj|2Gnfbm$WCL{>hL_K#vuJXn~FI+GloZPtlw9k&3Ibl|2-Rb4MHW8Q9}?W zL0Rb6qiqbK{KA>h$RxpxWSMJY#c0on)u`jTd`SIIL2>iqf=Hr*1ySuQWICM-@kGDHqrps zZy;sC&|baHLWC<9`P)%k(5JHzUCW_>2J$@($z6GgTCPtg!L+aBl%I@IM-EH1`hCU^ zvDu2v;4mjz80Rm^2}W8BbQq&C7S#R0+GCF(J2SaTX}oqf=t;<~%cJ?B?++>=x5g9qMDsENE)B^Secq zh26pkJpqYNir}h+cml9+LdoQ*AaEHcJ4VI5Je(i;1>j110(M`ojFS2dMw=w3=r@Q_ zbgp|Iol;HJbdyI2D~U^3!iruEbB%5gMo3gNu_PMF$wBQ((uIgC@WiwBqClFWwSAcNzrcgPBI{p9#b9DUn2iu0tWFXGV1C(}MHpYPk-dIlTt_`2 zirF%yXOb<`|AZSb2qfojfzX9k7)mbH{@> zgJ(G2)v!^w8#qyoG9MLiz^sVCbQ|v(QkFdULJ%x_lxyI_7?gmqTiCvgqdzuA{ENFr 
z@9e11)+78o*4PuHqt<|mz!m);{t7-ab$?-a{(XKq5pq8=VT|6xW%^UO>)cLYGz9;t z=hY+^E~|a{mNpd=nzjE3#L&zUHf0>zm!$ek1E+MO2EOKisCfuG5VHILuhlKTQ_rup zR2bFsI2Y0J@bOO8sa~XzK``S2+*HI-huQsWfcVd3HxnX;Ei0U?1#^)HLRr8npE1jM zvYr_oZe1{I#Qe?h>^SSkWpgE4`*hH#2W0J2%ZZVtft}`xLoOi~iP@#L+`bcmp`LXT zBnG=|=;}AI#_75iARt)tqhdRUcYlJlU2Ny2W~)C&j{?r9<-nw*_OSL5oL{ghKm0Fu zgT78d!~7zsha1jAFbEJCI<|T|-oPC?CaJl4u;0WLy`LsXa?HsFy~7hg$a1^uI9;@+ z-w)A*QONBzWPXt;hqcbe%pe^co9AXQp8$)M6h~_>DiIt{nqot~- zI;`(oi@b7s>%KP99v{);fJLVO>DCCfIy@^7BAE!j=-O8!ot*||)Azk({{t#(=&N7i zrPw?yQ^<;=?@F-ZuKz71i&h?yF5+9{B9S4adqWF?4KQnX;*bQTZ;pjDv>hTlR8HWo zk%YCTdJ}d*0$XT&+@TKZ5MGfhmNr5k3)c8e58IGX2Yi?E!v#&-U&3!LG_>>Dm7V z{i`~*++T2qlyN*Y_F=&5#F=W7R`_s8Hr6wKxnZfw5zj^kvN(^!{an4 z$w?Mt+#Qc6lQH=0Tf~Ks4Rd(vsZ?UGTlHcWTNl{E%v3*ZKC=^~SGbIV7$MCKl@ABd zLgwi->@dy_z$(Kb$2AX^n@+PCUWU6~5C^<4%10)b`X18caOSXp_v{nQQWb`sd&x?2 zlapvn+orqP;~z+7ra6=SOjcL)3IF81L$>K?v^c-7FJon|{bk>4%{o%s!hb~B9 z@9&6B00;e{O?@gwg?@gdsLZLhk(Y`ZvMG&hz(hlDQ#Be|z$Ed=0lz~o&ZA-!;J!ol zxgEoe8s0q_ZR%VmmL9{EW&Yt92Y)L}4m#YZV!j7En&k%kU~j-3_89vP4}pF1q=;S5 zpb1RGB3%X`wa4N*3rVp3h_xT1h$~P*QLCvHq-07ZMJNe3^@!anEOuCoSnRT(DU;Aq zFEpA6MXyG2$*DhQ@fR!@b0}I8p?y4+5UJ^RIw_&J0FbNvr|cv|gN8TAy6`HB(s7U~ zR}wphApNICvs5V+D`!jPQh6p{$(3yUPn0Gq&zH}Z7Rx6}XUh4K?3p$3cJ{}Wz`xKz zaqwF}`M3gJ>8GW7PQIO^naRbo`^zoDK{i@};j1^Z<0WtqkU5bL=ER$kFN-BqQMD}V z!7=$ef?v3Z0=Yn}zH5>^^&~GG;d2XFUpRz6vVNTgtGVU64bfKj@X~O51$N^}bdcZY z20t91{c|eti(pZFee_s=jnu+tJ#rEt-Bo={4rp@%`c&XPWX$p4p@~{Gzn_9v{{*P0 zf60RKlPJ|V2lqCVUDT9=qRxbk?DrvMpN^BGvh^?u z*_a2G^}hnBPdFle7*X;5tB$9xpz~K8hg}~W7>DKlz}Rp>)KfN$(K0@Y~X zU_vALZWlLPZRr=#`We1m^O4Ae!hni^6M;3j1emd-3~IPZZ*PsUQSGC>H0n& zlb-53EI4D~2=sV_U_sqQL!1|-9hXFff0h5)|-}mchWuG~A zW_EVwKmTiHvu4hm_?Q0qt9?UPg+f0LHT<1|_$9c!xDg7eP%@;9q@hd|&Y4LQp_L1d zSV_wWNnRuwkuaK!O4yWaQsHFm@B&FSC!1A7MURA%@japZ7Axem?2o%Q8zKIxWm=Q1 zs;OS)v}&0fbQu-XcP3;9&VMk=OpK#%>~u+59;!|gcBKVQ_aJ3$3uvz% zfnv_?Efh!X{zAU!*|}`Ln|2RhWbe*rGX>?Wus5eiMzVf9-Zzx3-?lS_{8f(Ydv@CH zKkPf6?>gxb+b`6r+5V7QC=L$U>HJ}P#7XD9c;3l4o`<$O?XA9zwuYSaRoUEOyFcwY 
z$}Z&5sB-K9w=iOFPrLi=o!Q)=n;lu|u~9tDOXuQS92DJ|c0I?HW)2qI{hr;A4zPh2 zY~O|lD*L&8+qbQ-`-^ytZ>xgi+4+KRyUsy3>pSS06KBbSD+SqW*BQt;86R(xcMjT4 z9zDtC2iKu!+Q;)z!^RsrXfm5qj*G0}LVssG2yfroS0TamB#fYq{U0r4^Zx35dc?_K zh~B?{ABFdp5llZJyP{)+FCNzR_Tl@*6+Qzb9rrupMCk##ru|ZF26XzNayms z7jGz?yMC{?e8ln7mtV1A-#P5B=lIBaHm=JTFWqE^aUk%dy5od&9{{G76Srk&S>mUqNHXR$564w}+(@=atOc=F zXO3!BbB=&Sl5L2$sksuLi+H=5C-HW~Pf;hR1xE~&nupY>>O{3rQuC2oq)w7@C#aJ# zV5bH1yYqh5KipgF_Yh*;7xJa9ddiWdQ(F)qqpY9Bsu@5IX+KkNUB??O-o>5+Llgh)0t#>*HCjhQMsXpDsqZSh0CF>H?GwaVc|q0OOvtM{5?)>y<3 zm#uMQELyfy)BYy6r)&;e<>q7_5+7sdc4oAx7xUn!a)`a_p zA02KgH;o$uu%bFsESFR5yJ-qiS{ zFn}r7Nk*JQzMD=q*BL}$kwYM(Hk8RZu9H&EKsJw&FXU$|kKp9gXkoONOZ!eLlS_ME z>Z-JxP50*+>(8Xj4%=OlcUT_NQDx*92zD=1ZgPHG6(^pxoj@vLJ#-2 ztX8s?ob9bkgoBedJ-2ID*QUPiU8%kuDYE%gcTeBezRQB}AQ)z35JG1Z<-(bq7sN)= zerBl9KO97bq7Tm;)kP_2m#eq(^EFne&a(^4Z33_KLH=BWH*2h&Kew z9|zNKe8bYnZi>HySs%g@f@l!a?Ap$uQLH!*q(JdB^)wM)cJ&X`=@v#o2oP!%Xu9{Z zYSUU6dZSJzo6b!w+0MrCG=oLc#uIH+htoX^eNKekvk|!Dr7qXh%<)^;IDUuf!#9Yc zdibn%qm=kJ!#7p0K6)JbI)$8%gd$CVLiR`9<)96QOH_C)j1k4Xa(LVvGxC-)$3Qn_ zV^OGVE+X|<>E)oF?dkmvM)s<7uIShh&IKe$-xze!7kuB%LREmK1BT&c`=OCIUM4+C zR+sfVa!@>KO%*+sR|QDiYM$3v)!ta>ht{wWQ`@n@UR+Nt zZfJh%0D7jIr+($9vw3fNiz}$l%=X}^KC&CL_h65c1M`9+Q?M70Q0sCRvoA|$$zS^K zdevV5wN@WRg=^Y-ub4)p)9=-LFGK986{4?E##CQ?`rA#t*S{Oz#-2pLfKJvQ;+{?-e+Hlw)wiH}HM!1cE}d}_R$zHfZXk#VB_-oRLn*G?ptV{?Qpc?l zO>C)jjnKfnL5g`iMzq1u2^JZ4%re?6Yxeardkb4KTqOKs$ZHcWZ!w_CQbWU74qO~% z(>)z|STJC(SV&kLM!E%yV}=GG9Hb`sYs`bIq%No$srsT(C;}sTWnE-<)us>xhIKK; zvs981NPxU(LR{YL)EXBY-oo1zD=Qzb*f(juL#ouD>GkXuTxUGS|sNT z_zi8ObA>#ZNzrQ@gB00Zbx~u5YC(g6UX}n$@_Z+)8n{JuTt~pTQZuLiBECeQTf`Hv zY9JksTRKx+(RH!JXdw+`X|3h$IvMJD>#P?CfiLLVzO#E*Z&%Oe)Q(-LjgYj@>+TD} zm`Zm!sLEXls7Zwm#Df!O?c>Z9&^AR29EV#_)1$&4YKt0i(=y^l>5N&V#PnMKm&ASt ztcc#aw4V}}{}Eyr(yb4)joJ9$6T2`7`n65EcMEN)e%`9Q%wK= z8v&lYv1?a%YBnl#SAkw;r$To%Z_xtLK!Y+^Je_)GTYGOsIZcOi>vVYDzoo<3TB>U< z;HXoKxE+bXKUYc?XFF``?|GTNDl+s=DSZdQ!~f(!xk@b(iwDR-C|ic zdQ6~!O&<%qm3aXf6yfp<;lg?G|G^J9ikV&On z+Gj2`isVU+-G=d1eUT)yViB6im>c^=_egsCMg 
zF>wsF;szV@XpzKB%Z#1U8a566i-qG>+-!}sHnpB@8m$XVvvq;X8W~*mKMvC+xV+~A zrnzU@=u0?aBrQ5*;HZf>QMzTsanj_(R9v-4yczMi;*x>>l#DAA3%NaL@2>g@H;bFD z;HIy@`tL0c4#FKt&uG3l(hqWn_E5Df9IRsPTGYOky7Ls2SY6jU;Q7;4uv#32^3Fw? zF|X>hYY+59jS>B9Ae%|&uvV~Oa`pSQ4sp@>?qPc%>w22XmvNbM5xQCBKntn$ChZQ2 zFHvfs2o@9+@?yp>xQS}dR(IN6RDyIBTRE6mw`#cx6M=hFyMno{plv`|UST^}wRj;E zNNb&s+N7?V7KyYhJ0N_fv953+=#a4192bT#Z8az{;3B9k3k}pM{Y|W(8<(xy>*>GS zVEgGo`)S&g+Q+v`;)bQ78|_T!5nnU&jp>a=S8Bd(A8sA28T2ciJM4jP_8%5S8E#vx zo7OzJ(#bUR-jS>}?(tfvQ**Xg2Y79#EzGT&Jg?Kho4>hSIYo69E$C(+zI7P|8#+5iW;;mc-#=8-{yr z+}LX(4-U0dOq{IYp1K#dCf0iSs09E%W?w5UiQ&+A=qB@k)vK2;45mn~aL{q#_cc4u z4U9t#tC|aXGQOU^Ht0X28-B*-*mg8j>Zs9njdnXzDLTVasm}Ge0%+XZfJm)oqB)p< zJQYJzg%fC=b10K5Hd1%cPU(kPsS|wAl=Z~p6U?j8@zlK&X^M`fNRX)lmt$%K+Yv#O zSR4)7y*x#(s#i!v+!c5UcPn8dVGCh1VTz^k<;i>XOFfR1w*E{w>Wl?Ol)rH4?p{0!dF*k{r zrs&Xc(>N*GLJn(KG=<$!#;P@9Qsa8R8F_Fwlk%%&V^a1gQZbf2bh^R{n$f4bt9`l~ zE@_Ttq_qCBtdj$$YGC{|cF8>?Xy_x^!6A5XuxVBGt%GT3vg+)aj1>KWjteGR4>qiN z=ma}3IPOkBBJOg|HHmS-yZ?vq96+!To?`_EvgXNk($q95=f&6Xo&?ab=hK7F zsaZ%;9RP9qLa55zA94FY+=dEYXPykd|9(@_*jABrWPjN0Q&A8KN+^SQa0A}mQ2XYF z+D)jTahV9^wj9l#j`=28ng88(ndmqI8E^WTgary1=+ zR<%&4rVs?SS9Pn7tc({hCI8EtRwMnk31oT8h2crsA zyF*&3EqnNOv?}EcX7hP!VNgA(@YS9*y&2JYW^HGNKDELgDvUZfC_tM`Tw~ZluX)A$ z|MkOaMY81dSx(5{>u=-VeHU}T9J?|Kim~iS1d+6t$!6ViFi(kQ_c8=QcqDx&2t!v3 z=ITuXt$AWYEr@#PMh1>o*|nkKC%Lly<8j{{MI8}riI_I3L^zWXCKu|u;E{fon{HL5 zFB#VlVV9CE)!iWJ9~o?NQEiaxycJpjwwGp7_w!MB4XOJyL_*DAm2hw^#MK7L2xbXZ z83U!qjMCEM=y`S?PBGn+Ft=<;w73-a?uCSl2VLJqj1=zqUNqAaJ9MY zR6|F(GF<8-?y~`3z$H>V21j(rh_}Szt!K2JVj6NcgS+*=7a?{DE^ilD#xzR`8{Nt} zYQmC|&Fw*Kb`OSVOAUqwEU}{uoT`yc?PSx9hI)paNimW!#7|fA)O=WB%}!i4w6O$} zEwY&%!g(6m$WG2dxo4_Vfi|u)a4p7lCa$w^osDY=uBEt^;Y#3Iu2uujQL80ffp`b5 zPFyQ-or`M~E|yt?Yb~zxaGj6qS(1AJVi&6CAGMQhY8N)wdsUy>tu9r2)C<&QDyc44 zd(}R)eg&TZ(VJ3N;+IzaDx;Kg)PNdPL%5Sw!>GMq<a zQC+1DszZ3Q?n?^&0hB^*VK% zdcAssI;!5NZdZ4xJJnt4Zgr1(lX|ndSG`5ORo$oVS8r1fsJE+ksCTM&sduaQsQ0P| z)%(=@)d$oE)rZuF)kErG^%3=m`lxzTJ*JMS3H34carFuHN%bl9Y4sWPS@k*fdG!VL 
zMfD~1W%U*HRrNLXb@dJPP4zAHZ562RsPC%psqd>Fs2{2ysmIli)lbw<)z8$=)h|>< z{Zjo({aXD-J)wT9ey4t~{-FM-{-pk_{-XY>{-*w}Ce=UGKh=M$C)HCJ_Yf`vmx(Kk z%fc1G6~#q*C7m|oisNd*)ryNW*><#)Y)7~b*Lqyf2IbFl<{@^{$!rstoGP<23Tj+o7D zG?HN;G!M7(t_qDs53P{&T$lrWTWOx>&^m(%#9$76_goGs9x_Ze z9Zb8_lAt2Yu-}K+blr##jN+_{ECuhC6!kQqL#9~pu;N?coNNovM=XxZf`gL6pN4VF zh?*8{C{Y4v@8gP~j8R%RZPC|#A9c%#N-borR%UNRL?3V@oKy^hSnYmb`G-dx zxH*b&*(WTagCHV`UD6^IY45x!OakJ?Y2~?>)QBfUj1$1xCl-Po95?2hVksM^o2SF9 zDV;K7UK=VxiN#XM_;tq)AHY?+377W{0A@+d6}ogVt2il2Fm|RCW?QzFG>@3hr*J0k#A%F+{zx3_Q|>6cxQj4-(%k~` z?%`Z7Mg(dye0CO$)~L)n_psYcGu$StzCGM-mQI~9^Np;hZliKp^O7m&#s%$D2&3>g z2h=FNux2f4VJ$uY8i+N%wxY_p>0W{e7l}@ts(H!# z&qoBLggx0Q(&i}iz^5S0cmxH+-Gy!m$DZDSU}4=xtD0|``A@J|)ORYX1-i9=_6-k{NL^n^RRs~ij z>t=%R>P;vH%N*N3EVc&4dNXP4YNPSj_lK)%&*zcj}4Ffz=PcjdTugkymMNh>2BeAm-|cwUB`PumL^Y zjOsY6jJUOW)#}v{QuU6m3Zgh#RqJS15Ggp-j;_(Y>}D^!1M9NlNKWWCVgHe<`x?O) z30^BWE_g)nI>GA&Uo7|%!IuiYOz`D`HweB$@RfqE61-9HCc&EpZxOsz@YRB^5qz!S z>jZBTe7)cs1dj^7QSf%bJCfn<-Mc1@6@f8OGFPl9g*!VtCyjGT<~ip~8dppjX9q^F zYvUc0=8o>bNXqF!IY$^6%iU{nqhxk;Od400tYv-M6Z+eSGo*aM35-%;TsCQ(5g3_d z_&FPQP8w$=!6?{w&>zXhKZd`fmb z9>4Yh9bWT#(ON!nb2WK;HN3YP-dPRrl475|zM6cOBtKre`yrjYSqC4f7J6MZysa7@ zt%h%`hId!PTdLu$`We+?NAA|G-BrE&VV%512ao9BgAyFOQwO?}$8^PG_vp+w>EO*e zxK{^n(ZO4FaGwtD*TLI#@PH29u7h{z;GH_quYc^_I`tkMyjKSg>fn7kc)t!lpo406 z^>7{2Z*=S-eeYo%d_)KO{f>Q9QWMwc;6*yPRtMucIHH5=ba1^6UaW(c=-{O~c$p4f zu7ewN@CqHgQU|Zn!Hqh&Ne6mbC-lfq=vkV0wZ8Wn9lTZtuhYS8I?$vw@dlkbs)IM` z;C3C{p@TbJ*jHMAp3t*5p=WPG&)$Tdy$L;g6MFV0^z2RO*_+U_H=$>5LeJiWp1lb@ zdlP#0CiLu0=-Hdlvp1n72{#a4Nw|q{GvOA(t%O$-UPE{-;dO-D z2(Kr+fpC;?JK+w(orJpxcN6X*yovB;!o7sI5Z+3-k8nTXZG;C1Zzs?yue_7+F2cJB z?;*UG@F3xRg!dCZK=>fxLxc|#9wIzU_z2+tt-xm%&!tx)b(D0eHAyA{ga3gvEva<@XcTcO;o zQ0`VJcPo^;70TTT*5dKQ|8{zMSNy0w}|0Mi3;Yol?tK5B*u$#%p82b|8QYOF5 z*w+btOqzrPfWX48_yu)Kyaibnu1&bSvrrt)EgVmvDK~6jU$+fsXioD@I$*Dbg)Vt> zk#{cgV$9=tE7gph<7PO5A1-6}n1;~)(2XJYZ?NgHFFe-dTjeG{0@v~YoN;ss!{rMD zu?Ytq9*|gUzu`V`sD!*`IFZlV8y<^|HIKz{FNQMLB5X#9YvG`YKMM|LxPY(S6FRT~ 
zj^jA)KOt?z;qG~&Yz@O*jn5~eJjx&j8FT37fh$kaQPsjW%F(e1TBxQDY=BD{@30>m zA+8v9wU$(CttB-FE^oMzV8FGfzks3Il07u#;;WuS#y1dkcY^KU93=O|cuV^T%2{v}3@thsY_u*mL{3t#cA1UG+FL=bK9nGwMII{FAf6 zLktq8a6-rRgD{UZSiB7%ht82Nn?3mcW;lt0W^A-z-w)prE(n7guIDh$rEAlKyOY}+ z*v5dvi0)5pYrxG7$2T@Sn1Zh*JRhHZVygosvGsxc(xT~}mwM}PKT1%aL+# zV3z5=lJQsR_>GL;q~p{C+*@?~R>ohgiGK@f4`1@fN^RIC_{b0rM`gpLySMHMNOPJ-8r+8<-{kI|NuYSTv8F;fLdvW*8~6%`h#5R-3uN X!lyIXp$1b2^1xdA3qz;2z3_hkhRU53 literal 0 HcmV?d00001 diff --git a/bs4/__pycache__/element.cpython-36.pyc b/bs4/__pycache__/element.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..27bac8df42548daf170a7ad9b46000aafb6164d1 GIT binary patch literal 41980 zcmd_T3zS^{eCL@Y&WL~1zLdl#5} z%q;GmC4lkZh!qrzQY=+=lG=3=*LH08L``3=)3{DrKTfNvPLtR=wNK7eIZgaXBc~@- z(sOcJ)v5dW{{HvgdF%p)X!W$`bb@ie;%ABEuQNOrS@^ZJ+ zs&$)E^J`=4TdLy+7rRtmY~^{ldz$SG=9B?6R{_&St!l zH}-{8<-V6w-ncjMg_Jkp@4lU_?BROS+rjk?yWY$7l(&=Xo&H{)-_P~5w~OmtcKra? zGv0k%-{(Kb^M|jQTEIM)ZgLtG!C z{%qv{*N44Xu4k=<2f2R2`vBJ;@DK6)u$R4*nmy9ZzH)x?I|6{&bhkKPTU)DLUh@~) zXBV&f<*>QA5!U^8{PIR^X|;C6FW19zquDCg8)1vAzBgB%zjUb_`fL7Ds~LplrCLMJ zFZ<=qkf*cR?&!H@uwHAm{GdDW(ZyFT^e)HW=(}p?H(Sd`pZiXRmb9~PUitt@_<<#$ z(EfBg$C2d-50bP}Ura5hfuYP7nBiOLPI@J6ce1@Z^e@*wd3vp03(L)hU#1m7{qknZ zFE?uIe!0~w*J$ijKPWFZgYsp%c)b>Q<)!BOMy*x9TwklVZYC{IOH}(VwV-4z=%n6D z-Of-;dV#?kx#G7}1E)C#WLdwu{_Cmly?yLzbKO5yuQiUHX)bN9`;At3>a`?0kcIH;D z@kEERxqBqKo3XnyaUL(dhxhg{=eKg5bSGEMbTXvbPL?#+$KQq zwc%f{R=b6bz+bN4=;j1>-BG31Qe|9P3j=j5*rQ}GNjKZPyy6DBH`@2{;7N{fnk1Dj zrYFT5ZPS0f-nu$}wH5#;mo_hlt$J&- zRc|)hFPtueSsLEadZWxJfK_a5*2+s{t}OvmWxui1^y-Z(<#5AasxQ}l4|G%(YMG;( z!GqKvJf!4d63v0vT&h;L?bSYVyujgll@4@0&?~K_Q_4^#8SE9n2LS{O^Ze$$0=*pR z-XHbp71qvRhugn%x_rfN_(6TC*F&Kl@K5<24N1AVT&|VZd9&QAT{#q%52?=#(4ijU znk$vc{JnnFZ+rnd6+`r4XsxsWHVBU{LcO-nduMJ#EA^p}l}d46r7#*2o$(;{bpA(K`-$ya19`s80D{JJ;wW-*`36!U@ZEpa4&S&8#t96$X;!=Hjet&HtX-auz$%Fd$e_P!`}~W zwno3OzvbU(9lP3EUwhtA1Iy$E2j6amEcUM|2i{wncMss*zW?yy&vu@leKUL_UPiNr 
zKl9n=-+E#;fhmEJrea=G@hnLvwIZUDhAjc+ji8iUnU!pdbs%;@=Ml-tvNE!e4zXga zjC{8=|IyR0UOKy2ef6w}Xf0T}+FLEQEI@I|66xmQtjKYvmGT)iNwhY_N-1W>Gtqi+ zL$P({x2zZSD?UOY)gIGD-&{ZS3!$V3*@Xf2LW|9~rAbbd*<(oFOYKRKHM8&?twVUE z{FMOo1!Hi?u7myjxHVx+!bsp>59%#{ncMNA``{fxcg`)k9@7)O4PH4w0g&a*%Wf32 zw$@#3PsCNy3aef<2al2X9HH?L>G2GU{f+|{FZR-Qi@lo({O*X@%KDn$O}}Do`BN~! zBROb&u@va;5=ZhUzUCB1IL&DH;{fkfTlZ;5lw0?CZ^SFI{tNzyH|mwX zkU|1r2pmB~7l`<#qZ+3jST${r`vRQ5VqrgFcRsi(Zma@J&-bmam6fy#sK9(Qvl(WEO6 z`43edPI5}BNmusy`znw0<&B?jNW0lAIav$-Ia`%9LpmLDo zP?CGjJI4LP{^81OU+yQp4|4Yj|B1>6IF9Jaa>jegdz!mP{iBt+GAvB3yixa;9E$*2o?Tcnb{??QFM#QAUM!i*MBm#fUh9QLYXLck0c}uo`9TjyD*lp{uZt*w?tympi^wjEG(cD5HB3mg+o~NU-DwYD9NBd0yqrM4CzowC-dGAM$8STsrKICONx>=4NYS zvo$5LfH{2CgITKQb8N3rGE z8?U+tkVy^-jwEndUoJOTkzrU1ZbmYLXRPV7Lm< z(yY1BND86NcP&TqpvI@RA%#Rx*Z2kgl{)1z8X5@w&5Ckav#b5ho*fCEV+tLxja2Jl z^~U;IAVe03*mjF(Q%pxzDZYC4<@1*oU;Q-Pg{{ly^*p^a1ZBZTxcy@sCK{(kOTZRF zdnU({1HSfcg)d+~f$af~N(;b%%vPXrRG_^nAwUB&zzKwS5JnRk>t%1V@_3eB(F)Af z(BG!c3-k9OvlzjkntP;tU9zc%WNtD# zSIoq{gMdn3Zw3Q%H&;I2l421+O(w72@RyhCOLZpjNEwx0x!x*=SDTw_p1@gC1n|D& z2RF+BNO-Mp+84S7H>c;oE~BfB=Jkfnupw13Tfs>ZI6<4zL5ymTeqK{4U0AAE$`m@$+rdXlGLRi1Jk zG2YCvVOPlFZDd+d2x5c;49>4g8HY^kOE)$5iY(G(1s%+MH>{#7|jt z3}f^j`W2`3O@PJVVy=B)sQg-av33QiijwP)RY(5jX7fg{x+PQmoDG6z(9PEyo`0h| z0vVK6hbd`G;f48t&S3zcunY+WRfWHisg(reD zO3spWi|`;m8X_-{h#H(DiAK9J%9y(04L+)@`K7D%HE(EuyM@ih^`N%#56BVeAjL&! zlr^?{(^Kj8bpPnO>+c`xAvHAfm(V-TQ%NXvjXfD0{@9GnR>o!tTm+;4ahdaaD9 zer7*};&n6Y{%{L(dj5~pK4U*=9q4tczfH=J%(PmhLjx8528mi^93~v8N$|-Gzo2K# zu$dX4;9TV`=Q*8qG9ck|;%Qg%aGH*LecN60eZQw67Hgc! 
zl+x3A79_v0ps=1=oMKB$EKl82I*d>wEU~yfH~K2YcGAIKhR02xovpxmHXiS!Ze?%m zgC1sEEWY4)C%ck!sR;+!Yx`J?nekMxAmIZ{yWqtfYp(s=IWUcrl+?#$X`7=uVrHPtdL4U`6O zW<1JV8|7(nr-hm)IB92$CZl$;m37>{bPa2{_57`DI;H1Ab-}x!L(tyJh@ru~h+?Ug zA~Y>$G%D@oKg@!<_V()_OJs;qSmu(z{5CY~R=PDti_l*NzeNe-px#k;X07@r`6jI9 zJi=jmH5GglRyzNj7HDQ=(&!;Qs`B0~7;7)PnhCz^>CO%Wz)W?jaac9!oxLf9#6pL5 z@(kH!Wk+>qM|NY{p`6^AQte+$2S@DrRCO2k?D;K9h}*oHC&$lP9Xo-#A~lcRc&alh zqQR3N?r_IS;cR8+tx?kPTSa@WvOje(^|kc1PXUd3LwA0%Guj!c?rIb$Q@>BR7Sq~N zr$ozk(u$o?-fvjTcG82A@^(g6c12JAac4wV^n?D|=rf1d%7rY73+;Uo0dnNGydXt? zy&=P>Y!Gv3yc`D+J$DknWq`l~pL2n5I$FGmmtt##j1eXV8&)|tI}-@OI-Xj3|L%?j zQ#)W~zGMPmH;XMZ&?@fcFnqRp1ge`|uQ$5cHNVjvxmpWlOm(O#A!|GFpkpHCCKonjUYRm=%jVPM22)Rj|rz@rXqMofso8S>T>Y964990%#!8tFlxKN z^Vc;vY7<860gl3P14*;hoQW-}$!UoBEj5TJXcgB|coNoon3 zi2u`lVpqopzkMz@Fx8kaW+pcRS?R7 z*9XmjU_eWzx)@N64ZN9*vN-Vs%0JJ=`+;(=02g=0ApOLx9FUG04M<+kfYtAwS%N8T)?+?rY2PcGwQ=KX@0gx3_+PP#?U> zqy=x0#DHS-Owba1mgIANApe1FK;GK39mGp{ur0*9e)sbi#!mEueo$5QVyA=cKtEEG zzpdwjeAK_o%jmPaqsesM0shA$D7H4V)HRt_QzL!g54DXKz|4?Bq5tJ2BLciDJg?*9 z91)&D@I@;JCk&swjA`PlCib>hUPWCWM7EjbdVt7*;d31jsI|&w=pDe;OEQb1+!Rht z5U6M*LP*&Sq5;n$6gpCf;t6BUXLC4fjaHWtZ>nI_?u4D8GczUzk;PPB@<+n245lmW z=45W~7UUlXHW}n-(6g%`aXdc~14Am?zONq`k{nAK3jGN&=&)hw=>s$*ve>MomtpOr zfw>$H3{A8l5d#vB38%5;pnpW9{ADD11JhegcOMYy=uQ~>7NOw8?4kh)t|$@f*3GZg z*XwYwx-mE+!7}%LOk-g{keeR1B51tg;eH>0@7}vGd#P0{~XWi^aj-7#2s+YY{ za*&gZh9ke*jMqItPT;He)5(;^OmEl^IK6S&Zh*Qwx4iw6$#B5{w;$dXc;cGg7Z6?R zAGBz&U?}9(v}LqatFS1}&L*<|;t}-^_4MJ0}E-=~-mc$zVOfoo9HV;}1 z;cD25f45qY)-TAlZ9fx9Y1VoJqyflXlv=ZLO`i%fnOtPpT|Z2Mri= z^zh!QkbZDLdKu$}ZDP=t1?QXd`+e2S^v_wGAzA4~UQQVLgIqWzv*TR7Gz>j_E2`lU zKzPK%3Mln5z0F2u0rkb82#H3FcDI0=_SL56HUK32rDFzeDD%zu4F`NCdkG85h0?(r zuol7F0Nb6g_=!W7-c|2)gwOGQM^06;H)lG9)k3hZQ?Mpy_4@*jEblo-mbv%nQe&!> ze$PoQJhx7|x4+fNVGa0}SyH~$Dd_w?Y2AzARen|eyUtO*la~hTRuPNLNT+Ddu&P1U z0^V53q47_%cPkTooh$qn(ZXlm%zP>HrBtKnrN5MUFa0H4P`{Y|qC5!kXhtWtwM5FM zj?o)$+#JXLfS&ADi9Kd}bE($IY6;Ccu&93**;S?;YMoN>ZFJZ))KZ+hSx0(;WeKB?I+tSf9oulpN&Le+ej)jZFv|6q?|Na{Lpv+@MUQP0T 
zfxm2A>6cjMa6v%L&E3TWegzZ~7D63TW_D6<&0qxqFPb9}M$!SD6f4-o-%pa2a|!8K z)15YK2t}Eg{wD$zRgg<%((S|j0PcVUreGT>J=n!7CVp>esp-vL;xR2f-q+FxFy}H) ztC{dwJ4-Q}nfK780S&s7eJ`Us!~Mzi`r{UY^iEOFf_HVE#nh;+Oy=^{%hR_eq>mV9 zByCNoqa_sNr8Kqxja7GCrU`5g_&>54ZS++?++1tn0B8=@SiPH@IP2xZ4YqyZNyrNm zms-sYH_2EK0cDyGh_encKoZV5|=^;q4RgA46uC)+YtrgswReRV?*Sq;; zYIgQ^^vwU3YBAwJBjC1Lt#bBV&EN@6Qg9~Df?CK-WgbXR;TnvcATybpc9sA8=ltyB!q4 zmw0^`3fH)IP?ID4nKFwn?HOR!2c+&FM4Mz20)~WTw{6n~502hLn_^z+2hk=YMt5nG zFu*fg+BCq$$0R9{5`GZvl41XDWAzTtpTEa;{UFCn3cS0u>xX#$(w24&IAld!PBKcY zq&xE?g3~)N3ns|FtW8{6+8iwi&iE);Ft_ z9YftSSEU40qn&Cknk1PiL_tP&Z}E4$3j{i=^$ zJj9L2Wjm+G7nEpMQBYIjDOn+@6vg;20I8+WOa|>o8gtPY*ptLw zq?HL`G3>D2FXOhWWjeS-72VXrfHQ#D+j-S55}R<&IBd#IQ+^CM;7`U^$Q(H2V=v4( z3!=o)+EFOOeC%(J+edlGRi)41ZCh|R8yoedRc+fS*M{?In=OiEC)>$aGjd9nV`rj` zF#EXM27!s@%-M6NU%Rka{lsgF7hhYfo?V###F_I8FVEq3jkUwJn5n9U@5?PsF1krU z8?{jG`m*ylcZP72o%l=mt%d9)j`o4ev)gJ+o|UgRWrUL#bW+sS=5-Zv6|>SBkrJAX zqo*&;pFiK*fgNrA#R6YOabRM|3?uF~^zOMsCk(~G-fHh+H*cGLyXnng*uIaazOZWK zU!9)B-9=iI;faWPG=S21&&|J<3GBB;>7vCnL~{VS2ytAHmt|50x~81AvfhEjut{I-tX5kI zZOOIwZtcCxw0-wAcCP!&)_G}JG(mT$aO4y48O4(1Gw!9GD>TERluOMHSIQ{GGa!$<}}8?LNw+Y9$hd;6cqp#QkfY zh893~%qBxcR;#x(6Sfd?NO2iSbPDsJJ$s4s96NTSup`8B7{OhxzFu2P=0zA;$2>=* zRgmr3;X=)cqFL}uYO7@K)lIi^CYlm>iRR`NdAj1k6Exq!@!IQc`XIWC@hsiK#aGXs zJO4(vaOt&k{6@}F!Sm{$B-Y_hj=SAhRWPl0^F8%az>~uJt^Lq&|N65IfV-G@>zCdg z@YLB>nP?*QR}$FHv}cBEQV5J>r#j|dreUGjiN!Ps}?X`_Jeo}8t4)cnCnY+PXQSy=!VawnzDfxGl{JTp2s*)W_ ztfDEM^=h*-t5O8?GpbUU0WKn)Q=cLjL=4b(N7;?a z14o7KGz=FD&o-HPZZ)?5i>(7Dq=^@%Mfr$OW=Bo6XA7rXj&~U``iN2#&ba`V|_2C%5Gs~c@mRi=I2;S$$$OzgO`-Lc(A${U;7v6P@FVK6+s2KpeDV5sR3 zp@CM5Jj?mgq>2P^`qw7AZmgSq@L`}K5c(?u4XNr=;2zfHBU@*5u77Dt2b3QM*GG8W z0pxZN_)|RnOR7c-STQaF-5G#5t?^SF2BZTJmw}@+8q&?%*tSPcJo=gWGp83%f2N&% z>#dX9v?dvYzW(;s!%tJq&n2y4Jp{V5eQWmeWNT~SZ*xF@zI)%rped&@p!YmmS8X!} zp>6h^E?;{2l?y_o(Noy9cwJ zGD$ovKE@d@V`3AFMc_#ozPusm5yFs(3mvdRs*rgVB6UC~y}|Y(`3{@n_bE2pBQ5Pt z;Yo=nvE4hzJ?ypaUSp>3E;-y?*ia()X;mx)U!&Cac=_UMFI_l4U*3Np+~3Z-pYigg 
zPhVO*`-)vb{Cp$uZ-#ipaT@%X9`?|R5Lb5!ROw<|!Ml?w z{B@0*NMH)Tc;PV%o25`5d!qrE8-Ar-I`Qb4Ps}fV`r_GgJ9DzMz<#<1Ro6pG9wzCI z88a6d?f!k;zWe;CJHM)y2%qBPKh6y5-oOk>M}3MTJWkGd20g{kT8;yG_<1>fGl!VP z2=FlC7cqg2+We0CqX^AK7u-;RJp3^>&5SG5p$qHbPxzC|8G=|S$YGj*9z}l#Po{Xs z!W?o=cn^Xhx_ns+@{z7g`_pWaF8b3vC)}a8n|kl_u6TRAy_B-syWe|&JA1qby@&YS z>s|F8_V)4Qey{F5;*~jjzcpvbN z@LTqddUO0f>K*f*8}p5ga#??c|R{2uT=>^;ZtLGL5paefbZD;8XX zU>+{C1_3=RtOj8`K7i5hO(FxNDo+sVgK!=SvO%yBh1VdU2ceRVcrRg4REUi>^1$pV@zdarTk8ULBe zhvK{3&wHQpUg!R^{>I`EhT}d((T1oX>mf-eQ&E6nW5>~K>3lhJO9|Jts0SepxtgOtI3D6{Xf--~?_>H)jIv;~vR zlbu+ddb!Tvlu5NASqU(z>%F`yIb}{VHz8 z7Ok*K*TUISNDT&Lp>MZG6(k#&K-9#T|m!K`yY$iJ$O5F zJIn5nBd@1!Xru5bIi_ukcZr=hW48^it}WmtAi7s*zk3?ki8W8=5UCX@Na5m0nP?jK zqBc6uI*ym8$q%)pR0%QckXLc2`r=GIjg!ZuI3wu8%=0VH_Fdsy>0WMLLFjmF26ZE&hhNZkV-2~|)kH-ooIiM=G_{KoTk zH;tB2*(#s+Lw7_lJIwSZ#8gHQC=8NUSwN?D&6{mLj zsC>sHSn^=D7uw>qy=@ND((A;-Xq)V{bAH=``}Dy>@3R>xDPp~b-Z8*~|E8%4FZ)}{ zN@P9SWYI1>UmM@G-^pauXpOnJ8;G@0Oc>3)d&6QgCZ)vL)sY&pI6~w+{8YTM(2G={ zTSh3fppb;zZ1f49H5-!aSMe`*0donkaowUN*>(a{(pDY=Pgf;jBQevpq#0Xk=^L)| z)?y-e81jfWY28~>lM3j6g9>lDcA~n|9}nYZB#3HDJYGCS<^Bqx2KoDHUB(GX~xw_{pC0?i+aShR`& zvLcx>SGxu(n{PG=sxRfPld| zRJ3*?Z!MFX?ghVpBtVgTk}8RgX-jV|-7X1aToVVBjtQVh-T*98W}Xt6G6lM}e@5H4 z(YR#sT3Tb5i$lm_F5`kwZyD=ay8P{&Yvj5=Z!1k{fTRZb;h#q%C8N zTbV7v#IV%H|F9UbZqI7lDeO~bbtHK98qA<=KgFcRhSa>Ccyji-$djzOugPObWHHV5y0AB41#Ma*2hA_t1j8-YaHcv^ zzNs>WHF;r3y$dezB9*=i*$~bVH#Y%iLmZ|xP^H^yXn25uw}(C=z$kRbdVw-RXAYBc z)yOLDWNNo_7;}-yg1^PXZeG%N*e&&59hxlyGcz_(WU}z%nzm_POclm6GemvP07DXJ zF$qiwlKL|bARZwfo7H`s;K#+RN|DXXmbBT`W@S9`RByK!7w#pKke`frs`ydTSM^g3o976H#CrGzX7`|pcB8>l66r9FgKu0^ zBSwUXH64O-T$_y6Z>D`LaSvgw0mZGLdDEFk69OFXt%}0T=IFRegLEQc~&zIc2PoFaX=u3wws40Hy+KZ6f(`NqDXx5j%u1lK;63{DSR&$<9t zJ&+~PgweI6nW9&V!KQyiLDO9SOfQUWtOu@AXPZE0q(>5d@M}usvl9=T-FR1b2M5lE z_(n2hh)^>kLKZx}J*puy8j?VXR%S9>-{RtmJV4)#nHGjBcoSC&Fg)utVpRyMV{}Tf z4I?gnuN;3*UPI8 z#W6HK2t2hJn4N2Ft$AHV_W~@gVHIQP=t~Mw7*gZ2h}iFf-5b|G;8WoEl-cp_sNZ0J zFR%GS(1OFcb=NU5MxK63ka9uEc_rrk^)hGCexOQ0QRE31ruEoO*ShJ;-E?CZw0?t! 
zKP=FCoRd_^K#Mgwoy{P#i_;-YNoEp^RN~qUNsjNa#v}_=(56Ld3sem9js1PlHshd; z`IP!-Tv6n&L9tLY0S&kkxcU6|5(;q+hcA3;B4&L@{$inJb@u zr-`zdg&1z)%s=R$qva8+#jL=?5unV9?>4B=gd}wlwVZj4^7MJiv>(<=vuwxX$JPe% z=(Q`d)cra=ck`x!h(zn)UEMXLxE+IehNu65CcwO^^D_)G?lQKjt*(T_)A;I8(S|#P z$i!7jp#evhZhxtNbCHcq%rGBX^>6m`(XvQJvZq-4WaM?vz32?!%6BMY&JLA{Ns0%C z*YrcV97W+lYOUn$MD4}ybW=Tavy5e{i8wSH~(5b(vQ4n=Y<=P_}YhF%z zKLBB6@~HQ&@8GeBuMHpSWUs{GztZiai*(0=mmo${*kQtx%4l3$yIzCTIlHk0Y*Bvs zLZ}sv|9g$Ic>exd-TkJL?MxQa6x>tznf~u@t0Oj@3I`;*wma(1P{w5asMiTymkx%W z(Z5ygIzmcF+i1otowa9>LUfBre^&T!B!d|75b=YAI!L9K)Fr7#EA=)U!|iBCSH^9p zfWq_c)oOKnc{=#Jy!zh^*A~BXm!8?YS)|-_`x*=9Gr+}L$z=2((3lUTz`KBM5AqCQ z1X6BDV{k*^*A&3kAQplkKLtt-eED)nxq7#;bRInDOxLT`Zzdhgws&X}&0EJk+kQkf zdok{_RrzGJ{{#{d?J}a%3W6XkAMHFMRz{-sP-uso9y_LLdu4_+uVkrK|6bC-Ost1< z5ALyb$F)X?&viwVRIYW_IL8O}ZmWH<>M22{jN$8a_HNDO+btAQt^RgjE63b0+WotL z$Bj`yO#A5Rco5_|X%W@X zA4b5GXhvd^vS=;YevODr=8I5k)#^V?T9izadypVFG7O=2u}C0-6HegHv_H;HCr7U0 zMT6qs&D6ah`(&{&!)zdp)ax+?h+$$oq)6K-)V2MeByG>c z3psg~K!n<@;}dGTP-9GFqA(^uJx}9|*E0xNtFNL7E+_G{O`+DKIC2+OW)@E}_VPcM zqyLATtPx)9F_c;+txI{Hh|BD&D3a z+r_02aH@&b>K`U!lh|O~gJe=QD0z-~j_@54FdV!aTBTfYO46oGhB_l|Zc*YGq8I|0 zM?f<;``lO3avD`wW5rsPMu2zlWVoG1VtAXk3agSG-nQqaF=*jh&K3Ho#NkxbX>{Q` zYKpUv{>a>c!?M!NhF}y4Vd&fgUrGejkp#Fd&uO#|?~ilspH0Lv={n`7ivQvo((iRB z!@!PDhXyqW@c!}p)Us{4+}}UTdXnv>n?WKLuDSzc3lNxmsTAL^bBik)ofExF_nAm3 zKhvvOJ2%CI&QU!A7Nku1YBqRNt}QnXy8z54CO*&;90WM`r^t0xxvpJ}B=ti3a|zRn z2oAv?QetcA7z-#!F&jz55yn2hVNVM!1!IywCUfP(z?9oYvvzZqaQc?TvBwPd+@iC& ziK(!OKZp4BhAzw$XSaIunYyifrKXNaG0voN=b>XYiDz=c7V98(hbs+a!>nwC}l#pMLt|-GUT1WPFSQ zV}i1u^WjE=F|p>ubxo_;QnREo+NBe7UhZK+PQ+X!42I+Z8i+MOV=wyqhupaX-vH%` z2B2DfI2od3jV8~wuWBy^508tgjeM*@7=J(xtKW5bdmavm^?pK4kQ3~x+xqQw>}RQn9EJppKez>p>R z42s-=(T$;d!a5?y0?u-1(egKbqLV{^_B9YwK@lR%u5z+dxK(Uy@b-wp8iudhSsoqP zXFa*3D44-W<4L=VW-JVFMsW8YvU`M42)-KGW4=ltgHh2)O3$tk44AV~@++)@0wPKi zMlgdt1Sn?g#tFe7t=M7EVeS?m+r2f$VQw_RKco*6UY?w~o(%qocXw?3Q{`(+dWDYe zPKmk_G9iV-3=8Mh1WzY9b~sHkURPzD1_|h}H#L*u?gV$9>Fl`0S5XwA@zzv>IOw;e 
zA)M|IXxZiz*PE4H)iT*N!5?!RGA>A%a@*=c8b((UYYsaN9T!K15LyHZ&$xAJ_qGiIsknoP=Y zVIsuKjD9V{w~WRtO|8t>Z&HHC$M`yU(cPNvOarT9wV!2a#|X4me$C-4IEu(?%QSyZ z{}SUK$jgAB*YjP7KIB?e>7F^qJ?sS z5D)cO;&>p0+mo{Snrr!c=@-m=Ww9{YBPWg>2oHoOP3MZUs4nbAH(hcO-|ureAmI+% z&D$*{nyoO9W(eco=Ej;I{FllhEMngsHLO$Fk>qvGU6tL}Hk+`deX1Xm)|}>cw_g4w zlFI#3Va7gPIGbT5QCu}XI_ntAnX3Cbif*Er<_FDmcZZ`~Cf+hUp?L=d&rl#?yGK>{ zACYu-Zh&!~cY0Vva*IH9Qjkl6zlE z2pP;@doPnPw;e1DXc7`(?FrXTeFxmAnR1G{D?hH(d3Gax_jTBT`l zq_qEAi+VWJObfzo!&%=c93NRDG`@38~2DcJk+O|3xk=?I% zWa0bSTk>chA$P&dR%eLwuc+7}ulww-i0)+wBLSQAbpS__4Amn{HnB>|_jIMW3c)74 z)0eR$ez{Xb^!zep7qgQEeL6_Yps(BAud5^@fLRehD@SERlr6H*LCbZ}a@lw8uN{OC z=bf<%SSZxA3AD$A_U%ngOsg?d8WWI+Io(1V`Wc|(<+HV=t7zo#ris*~SS4X#g|(v{ zPzeSEExLC_Ih=t@dy%wc<6T)~iN7qi1yTF}NYD_DLiYi=`2PzbO6Fnfr`t~VeG$&N zvff&pA*97 zT%fdD%rq*FB-yGLUUC@fZ|O{g4oyWuP=fzWcMTK#Yn>fea!ScnlFBYt{IKuw5XuPt z6&e4ku!S&GYDO5{xdJ%6E!US@v2MFMG-0gB;+NCX2m;0{ekm+L38Hxx0lU zgo5KfD`jmz42l_R)zR0&a>lqzGN66py$=$hZklRGN20*gM(IqzHV}GN&xiB_JDjYR zFgCv_P|?(;CSxEN%3)r92_z_M0tw<-I+?IYoIiORKx{<#L|9nq{G4Uk!9f9YO%8div7T2a9W@{m}xN)l$BF7@#7q} zCZKfsNK^#A+P&ohYDY9`#Jo*iV8UGcf|Ep?E%`qs*5N;*j6c+PlsHM@z_Oe1fSD<3M5NkG!W#|@ z!$C4@2qX)K=7++WDS065+LSWFkPya@c}&-ShxHse>%=x)5l3}8@)xAfTVsxI?S8rZ zX;j0KTtYNY@GUl0(jUgeR%J?Ai05+hdcxDX2ng5t=$5^wyett}%LhD7VnaJktd+5G z9(6XHjaof$YZ%j~$|PFBdTk>Xq04lD?@_|Q@~c+WRP17oCaNn`Trb1;Jcl1w4+rnw}52ue{$CWV=wmU z7jEGWqAw^7kCg^~RLytJdM*y3 zC@~{u27{jaYcE7_Ooa`V35U{ZLu15Nb>V@b#6%(lrO6K<3MKN9i%#}<@v&w3iG9<)C9oAtTKFNtG^NM^tSjbDxu(Fpt;K}7qQcW><0_P{> zmd_I#R9{)vuB5&%N#ttUL;AkQqet;g`GYCU=}atoW*yQ6%DBu|Rk!)Da6DMY%co;B zAORkYjA%yx%*YovS-%l)+Z$lLFaJ>Hk9vf`%=0_RAb}XwhTqfP#9yQpUFyxh{FIu8 z25jQWuf2L6x9>iW@uUea#h)+3DV*;vn-nA?pvsFYlSWEh17SjKs|!2h)_Q6v?T3pg zW2|1<+DH3l1l_Z*Q3zkSS3q8P?p1`ZVmsu&WGRpz9TzfOI2OW-edb4wKhlvVd1zeG+T5lvpT)aszrbhW`Sdzv~eD3Dj zPeu8iKAUxE=K8tvYphf~QbVxVhkdqs$p8OHw}Wph`ArRUWLA{Qa)@5pR|eId7d5)3 zhstRSsl@4|IMb)8N3K&e&32-aDjUOq73mKw~zJspQ+-VB*+E|5EjfUHk56V zbb)(Jp*v9zFR@#3-fdR2jYffh+X>x2t+QWI86y$j7W{49`mgL3K81_I2v>FM*OZu- 
z{cq^(jFN9C`R|lyM9dd2`0sW0_m%tuCBLQQx0QTL$^W3_ca>O_p?{*Y-&68WmHf|2 z{+W_LQ1YWn9#is%O8%vi|6PfRauSh(?Lc zAnc(69&*1_nk*h5EtEzq&|#hp$vMsxLNP~pq@DcBm&UlGkD?Kk3xj&WqAkBw94Wc8 zBix(91-dvrih(yhz5@X+JDwjOD~*))sZ?v%5=~0}vYegb2p35P;y+~S_^cCKK*19g zVu`Tm?Dpp)XI_zC6pZ!Ma>ha^rilqLQYm_4-Z)Q2y&Xg)nB<+3`AAO%pQq^dD`(j$ zvjK%M|3P*lCv?Z$TD90~#IbY)TRz~BLwFV;XF@7T913iOYDdc~kpf_a?N6+Z&W($a zbGc*NY;V1GHD;u94fjj*^5m5UXM=7uN6h&Xj+Jxwb`HU7FRnpL#ByCb8jm_;^+N{hMu{YQk;72wcIf{kO8lrW# zRlu;-Sm#k?DyhZUb;7`q(KZZaAY^et*l0f|DcLUM!okLOZGV?q0O+#c-RWtp7w!AR zzFdl;jVHU|6Dpty(lO5w{vrvIf(4ilWnX5Rn4%};QX_Y}=$>rSPjW{_06sVAoOW7L zm4DRyuuLOn?t$&z?ZJ^hD>>pmU}e`CL|E*gpLXM%R{f@{>{^js=0j$8~T0nz=X2*p9q#7eE{rUM6D6WD)i#tsD_w1g!$UUy|!8zRL3g?Z^fHnx19BFy^>{vA;~a^u-AMVkyEh!?nOx_cS&gi$S)q zXcYBatRep-L+DV3W*AS&NNr)5!l8&U5v(_q&uFpDr}pR0PMF+T8FQG|Smw%jbYTje zK$MJch8Sk9JSZJZHTEH`$^}qiI;x5$V#}nmqwi%JvSC3Qx)auX8<63b!?Ba{tk6A?t;FwP*m_yA%+&TWf@oiwmyoBjUZ)Tw5Uap>K06mr+*q+D}zQm&O* z>Jtt~480jx8IHR|5y3$vgNPuQ5m-%;bVtv|+ZddiXSDfvr!okFf_Ie+&aoBwOPXAx zHkn*jmTmom&VcZ+ontpIrS~y?`np7W*EW;46f*?UpCriS3)IW9P7q1B*HDTU@I}I? 
zXv$^`d1N8iy2}S?aMaw#(nNFRvAjXD<^Tm{WV&n{TI=oQbXpKwL4XYS2+d9$EhlstVGqxF>iTAh5hzOaX*9@FMbmt zx|zDiM~OYpf#zd7($=R7P z8Dh*ost!rQlaw;`U~yluFnNg1JYa155)YEUc!h;WxYaK!ET@*!_PKD>gI^d$RzO;i ztnfvczKV8(6@uHm(q24We$|vrm)K2@LW>8NvJt404(cLIRS zAubY)Jk#b9E<)fEALe%4vaiYC&OqZ~7Ev~{2&?cFG>Gxc5hOtY$*pNNjwoT9o57L~ zqa!2|JScz@q`>I>gl_akPQ(6qfswN@;Tt#Y>7-{{+20Jg)v|dG8*5@yg9f6Q)M^h* z)?ZEvxND&2IoWv*DUxf@yo?nDJC7pAJGm(~s+dr~xY|ySD7cF5&6Us8J=;Sd`i;3Q z+1G)oq+t*A!E$Us1hk2B9G-oV0r+v+eit2v$>)cABZ6!GeMyHH5p)A$BiNTW9M`Z9 zjwR!FC!(m;OTKn_^wwZJ2?n{h$@!>i!y7oEB0~)0&bAY&Cn@u*!|%beD>P1I`~%%TO43F0YV-v(3;fE&z@_PBoKN1gUfQRmtmJVLz9W5E z!HV5Sr!VQY1l{1LmHdp7zoz8pl>EGsUr=IXTPxhLe;TpgVD><{TRg!QZ-O146qFij zQ_Sxv{_O)@>7Qt>T=sM6z1bPG6@2$82htj!+A)=#nwolW>T@am3#X5BCqGq)j_K(B L=Tbk!S>^u*0z|mc literal 0 HcmV?d00001 diff --git a/bs4/__pycache__/formatter.cpython-36.pyc b/bs4/__pycache__/formatter.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7eef87b48722feca59c00498ece559da3ee01015 GIT binary patch literal 2996 zcmbtWOOGSB5pJ?uuU<(r-baG9W22EcPL#|_39<_W_F&g^W2*9l?eB(h;s?S8m9$_Cy8iisV?kSXV_2 z>zZAA5^|G@sJ~*;om55R&{=VOapM)Yyyl8m+eo#DBmB+U$J*QI@m6|>v)n@*+QaEL zsoS0Wib>*QukZ&>r~Nf<;&fPf^C&l|xg5-f8sp5QMV`>7b@MQJA6@?eL^>f8PRIod z-P8#^^ornNRk&eI)@6e(KgLm_`ih0VfH^On@P@c0Ziwbf7B)pow83eKJ+Y6mEnX8h zG46>5OuAh*o)l^t8za^HACIL@R63M8hy!g@Y~=VdFhwxaGPpREc~H#E`OKI!A4B3i zR^S^YL5k%LeCYaqfHo+mk|c}{icw%rQ@z!g(6^B)rr^s!qyQmL2!voL&4rY?)$rlT z^Jn)D0vaA11aU5c&z?W)1|MUOf`Li~Ns*7z@l3_TOa?<(3T-e|b?yGrJ2lg+bUz2F z(!I`2S<&V)lkmWuM8qbJpuxmx4znW@k99)8Q=^fie}Jyv0agN|2vT4tlNp2hii7LR3F zc@bwb8CK3NVm1DoqxNvrx5M5k0P3Z2-g{gmv#HFD?md<-WLBKhqW7suGu=BH7RA}U zPiDg`)qv!l*payQtvA&7d!x0VyXTi>JrZ&h&oXnHZs>zJtj;{Ttm(o^TPN zb~dR%zIs-rm5oot>=kAmZ~opVsz{_>WjG}=kTO9orO-Cjb0O8Z2$tvsyXbVkX0Qaw zv)7Mbq+`mufl*M=@6;?1Wo_jT^+O1jeyY=4n>bHoS%tTuvh?zJDwi<27RHk2+KOMt z&fgp@e+g}x0ChMa*F=>1tj#?>zrBsrt@s8-LUWVF6yt>A3JEs<@fry#8#WKAQXy(k z3{eDUMs_<)y@_odzVvt-^Xq2rZKNZsssC*v={GQN!Y2I6u?}PN 
zgJW4J_#}t}1o+S3xV3pYI4v%a^eVV4X2C_An=YzPRTs)gQC3J!mzC7Wsb1ZnbfMz< z4g9Ub!yV&ypxp{|ynbKZgjBz?Pti@@jH1$yqG=&!8O@tf^ru;zt-jQvNEAsFg>4kY z=*h>&&mWzf932nBJzE;LjM5{9YCw1Ci8L{a9Hwp%v8i{5X5nxHC`8mc zb@c04)hizN*s^oB@7C)M^V{`S-LrBDL^h2!Qquv*`q9N>>we*k7-ikddgy(D!WUMk zmMx3>#={Ba2DSD1WSh0YI0ohMisO+$Q5>-a`zv4YFDV`ITzGhsX6I6=UqD{jOa2_Q z&MG@RybHRl+V?>t^%@b1WUO@*E$?lwOuIgy<80iq`40%njb+_+5$aUc+jPiJiO{1~ z){l>#el$2a{=Gf8Y^{_hGRxKn&^3?-UD{(0BRe0VYkC3b$*`9HUr06)8yj=KH;{_T zg!(lR3nPWn3PyE6+>vf{_@6d literal 0 HcmV?d00001 diff --git a/bs4/__pycache__/testing.cpython-36.pyc b/bs4/__pycache__/testing.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65af69a1e1e0b1709e2b5d772250ecb374ca7f65 GIT binary patch literal 38175 zcmd^oYj9jgcHX=(0Kqp!ky`HR5|ki;OJE3)Tv6n3C=#S3YDI{cAZDX$tJeTN$kXS?8tG&an|KjoY?WCQgO+Zl-F^b zAC6O%{o%^@o$lLnXD}E5QZ|1Aa{9iy`*xo`=k$4X-%~qxj=c4)|MuEqy;_l7wQ|bKvXCi7?jl2_4 zBX06m>V7`y;rb1KDj&aoZT_3+XJl?7+D^~o=4{b-alZ4iTWmJU zi_OZ-TC+ZpY9F3mE_%M(xb8NJml_SPywGg8FBL0Iw>>mlTUl|d4dkg+8#w8g=Sr>A zxLbGIqpa^{!z-4nuGfyw7nkt12cDh3{`?Kr@6CIzd%0PzDA#LFjzl7toQ0z2IzHZ5 zT^h+Az4F5B{AX@lb?*3;bAID)#r2n6x8dCU%+2|$*PXHD#!3Y(m{JWjmfJY{jTbLJ ze{FVMc2#~emp%IG?3GLNmtMW($kFlLQlng}I;+br8d7dJs^ zx$88Riw(zb*6TIjRh7HWa?y86dIHj=x-MRuq8XG*CywRSw=>DK}iNSjlFB znz~w$^-Es4-pG{~oyW^g)|r`bJ_oqCUgPmGyyUum=c;av&dc8W1K-7t=Ef$@JNL2~ zQ-8RZ^UG!5sWw+!uUv9~8Rf=Z2X|J=z7K=~2wi8fTrE~G%q!(m4I_Ha$=-1q7sl%4 z@nbWmr;Cl^Sa&aq&a&q&UI4B&>gO^U_YUAuai>bPl}t9n0pqX~0~&GOB#}5(_f3%C zQmp|pbwsMvS1M~hWpGWriN zG~BLQq>{%~&AD4^0>q@BixV5y=~k6#(tt`0p%u{u(Eo+cExYBV<;J<0>H3}X_t3~b z&?lgzivm~+k${2H-;CX ztD373qsmnbbeRNEDmHy#6c$1tE3XI!0_yPV#gc1!&?qic++4PyoN8^=E7o<}7izVN z3y5|wXt~H78j(O+3S(-DhWN#^;7*T$Mv=4X-YNTJ#0KHdIpfEGTIZc@4#a|ssRHzk zjvC4Vsg%KRebl(rFukbfz`2V{X#a9RspO8P^Ugx0R=Vw|vR|(h?>cV9CHDCz9Lyi6 z8Gu7`g+qewR0UYq$QfL@2s~+^uXV821!v*H*v!c>r+8s(dW;v-`eH_Ios#tQSnhhW z(kKHBHL%QFhxYr$6}JOX;Tb)H0A>luS||#D17MSpu7`M5ZaHPeCZTjs4OPsh;EIyz zTqy5MC$G@E9?$SAN7Qw 
zKOk>VpU_uu!Olt<%R?=J7r_JrX5u%!x>xqyTyPR_iEuqxFy_m->?~;c>eJ8Uu9xos zRZ5V(2v<-s$ZoL)Aqes7T(N{X+RtW8=I&NGD1Twh2R~XVR>yKy(%V9Pp{XVS#RuW56v|{?U0q$B zT0J#Y^OiF6pU&K2UDIUB=5!J))pBGb@0BRZ27hs!JcXZs1&JGh5Fg3M+_-`;|4!t7 z6vBI4B_JjzRZ^vJOhJfFtKoN|`5`r;cHmB0?GZscnjcm}5T*CF)0AE*CYPe9lz%+( zH;bQ7-Juad&M4*S-O61of*%zYv^rJXqj-F?k`rT8vGYkG{$X|W~9Ac_|*BiyV zp2E+MA&JCVhk`y#^|Y@f8IXtF;O~?8`C~}dVz;0MtwnA{@5k=N8?m+MViec$a&#^B zBe5?>(c@U_xQOkTfUN4^@J&u_;T9xlXVI&z7_oAyJrsZ>-j3I6b!nr=0h>s8J8@{I zG^=Q*Zm$--CEweFn*ueR>}H;XkLo$N0{2L4SG2X$_KnYMhgdUz(xH7gL02M+=+>8` zx1wv&`|Lop_3C`n!?asuT-9GF5-ccD=$1ockDrRN@-=j~@gBnfqQG{{-Ntb7g6^nfN?57@H z+>hUnzRmN5JbeyNF&6R^ckI)oJYB|9psPH^9sN`#?;+wW`o7&33b9rOMqSqGl} zp=o<^P43vI()M?w{x|SEjnaH>TVH*TS8NB~jdmRd`5|u(d543%;itCGHvcWuCuP=9 zW`wQl)D=EwK2thnd&ZWgr1aNOdWU%$gLe~Nwt&n1n5Ub=-rq_FGizyS5zGGxpYfXS@c%emf_H;5IFH;z zjyJxnxyM@arsg8MT2qFLXx0G3N%eMdiA=?-HJ4yop>ibprJi)=TBj~U3+yHCk+~PH zTs`M}?%v3aPhWjWPOiRkidvzck;>7GK{nN5!jL&1$~~Db3AOPU{hLh8$HvCH;xyV@biBVNhFqt zkLZacCew*d;|WYHoLu%q6A3;KJ%ePIi0(-pLV761bPP4{-T`^CD}Eq4s?Sg|CH3>^ zuGna7Uvvaw!aiKZaHj8a8pB(ndt$NZXgm>bJ!B`d^}wMd|MbL2J6$MHr79HKBZb0B zO*Jb#A1xFp^_VAVP0S4~b9)ESS&@Uh!#w*K6M`ZH9ue^<%z8VJi@bmu=G#4l~@KT6Z0%}V_VLB7VCZ&c%v7uTM38^)sYA3FT)h@Lg z#}TMGdvV;M_No0ij;aId0UUR#gX%#Xcd0|_Fpj&`L+T?q?omh7M{(S%KBgYVai4P3 zQ5^THF?9^b1L_g=D2@-ParGFE2i4=l{arJQ=52+Js634@8N@Z|-NS##EIDSOU zs8cu|QK!`z96zc)p`O6;W9qDW632(tIdvXKM`hIo9FMA;x`^YLdP-fw@tC@-W^sH( zT~SwYd{jNHp22ZkJ*%$a_?UW5#c@oiPpao}d|X{ub2v_@7t{?LkE>6qPviJ;byLmb zctX9XUczxwy{ulraY}tg<#EiYSJh{6JgHt&-+|+_DyY|SoKZ!!fa57uQVPe@P~R7E zJfoJ>GLD~6WpxY3C)90K!SSqGQB@qDR5ew{@tk@?c{rX|zG~o@RZaCKju+G_1kXF| zL)1$`{*jw2wc9S0(c&x+fsPU%Hf|~ylp|ryhbzVCSuhMz@`iO5jKF9xm#TNQrW49{ zf-{T{_Tj)swAgGw&OH&bzSCYwgG3#2qR6tCqiC|gEBz^lZUk!v)%x2B&SGMh^lDK5 z=y`%V<-$M$wTZ^m6H*Nw3#z=hh^%>t30J=j`#32ui)TIOk!hOEyKJS}!O?4YFDTtwI_g3d8AvRW|?(`?#GzhGIJbJaF5w1$fus&uAyzl?7L|W6QWM zuPn8bmbR6h=(R6GClSS*~M)WqdK{Z|Y6GxqbKTqRCum*O9d(lP|+BHm25RhXyhsue& z#GnMlIPwL0@Eayyti}grpsZpW8devuNE|PuhD-0c$nF 
zM6IhRFt;X;pwb?MKcE%*D?0zP!c5hlp?;0+v0WlajYf|oT9X@&RlhpAhKEQf!!w2x z*af2pBV>;%_8`nmGF$*XjZABaMS?coI`fPR_dmSG+GaxAkkc`-S=QbS^C=*|wX#3z z=UO{P;LQ+9sOMmqA|wpZyqh@p=9!2+jgNXz+M_usj)~K!fg%!1!=MDB&uj>W{=;Z# zp8Agk#35X;<4_-?VopL-2+t%UBhk8ey$HX$Sdprf?JFl7x`*#7c*ywvz+<2t0qYU0 zouDsAG*>W>giDddiTGbW+~aHU^q3!*qG;7@Sn zahwDo!n8_D5QtKn9~CWeFhE+ymzZmHf;nUtcJ|m#0uO)*AKwrY)|a(iBnDV4AnL&YB{Bm!mbId`svl(JKDcKj81k4N@_qn0 zBAu$!%TNl&UGuA^S9yS-p->dz=${!3m<{T%AVOUo5OG)nEa-se5fF7Kq{q60!l4#p zL8toT@qX!#e-}R3m7qU|*$s|PB-%PLxD(w4x6}4xTZl{=-1Lu0(<#Y_sHF2 z90Upqm6>vN%KvRNBYW`d3Nk#IAZOCa0rx_ zVt7^6w*g8aN9y#GPd=&rr;LtUc8l;oLwG|J3StB+HN-{et87M|A?~84?&hAU)zrdW zmr)bEgtwh<$+eQw>^~(zkNhbTL=kFnT??Hl=^x?)Jr}VbB{yKo8_me=LxT7OdqLd2 zcE){#7$TmZp#)enOgGWi^kCHQQ%ipw;XJe+M<6It!2^cC{iPjX3E)Mt)p&Ust{Ftx zSn_mU(G#{t5D0{#zbb~qv5CY49V=%QwFRimXrGChlT4z_uDWH90VCRb?{_pie+rGv zR{iO7V2n9q;mc-%8^M!I4repWlgW8hJ6b~~OV_|?0W*zQ9#w!+y6LbOk-&Diz|o)P zkcdbCw-9E_jm5`aZpPAGL&1qfM1sqGhXRfry5+Pm5bMmQhM4(O4-+JP@lrxqIGDY7 zs?Kb^^$JHjKZbBzdo+#HJAXi1ZQGzY^F02)aO9T}7HR5YfPYIPkY zNR1D{LlR}%<H~tXZkHj@KNtrAdVV!9G_%10o0X z#q2N{*m^+aA;@2)zZYS#gtIYafoH7BGy*=-uk;Bp77KpOV02J&V8UbU5baK@#gL7t zmB`jJxz-TFz!2UwH<8i`r19ZDU97;B|7pO$BQ&&n`YD_tenAF71nM4H+SmqYNh!cE zBi-*d&&*k7k=l}7U_e!N)JOw%)H=$99v2xwncR)`g%BSZFNX1?Ti<(0#){y9Wt7Ij z?kHM`?3+l3PaqlW}Q* z-PW(LC&IU*t+RuBv2i7~2jmAfgw59`l*g_NyJ+%O9x`|h>??D z@F1}lJ=kfkUb+(4>BR#Q$j%gxDOit~B9otRR%>2G9mV4=>3gKLyQfN+=4E|FJHa4? 
z^)T{1dP=Rq2;dgjhnmH+og%eN3?3oXDQSm*A}&^eNM#RcZ1=Pz@P2 zD7AS_=I(*=s6g3ddy5ceX}HCsBe1TGMt8;W!?E?~U}&2fEToC}7Sat|5Yh@pa!!WY z{3K^0B;pK$jd49lq}#0o-371mHA4^e=Tg+?n}6TzvPzC)}`y9+iFl9xhk zd6#|xqqzB|*2AIKtk!BH`0<&2ck6*phR|)wMjbRD2IS+YFAp!jZUEvMfj0EH8CObv z1dj?@e#;AW@}arWm*OS`7f=dP7obz<1oK0MD6lsAT7uX70Fi}A$xu8JBoStx`L&)N zj4oS!Nf$oZ>0rv>QwNtUW@)7(ZY=%!N@K#uuT|k)#NrXN=tGncV1rp10hZ))K}o}f z7ZZc9kh_L}ZbYdggqZjsBG6F~FhqrN&J;o}^C+7siwxV!=q$yY@SnQ_GBZS-La&je zv;!=kT;QC_U>YviDCn#GhM9$L_ENG|`%I3q^wmymU*lMb+9!lf;)dyT6Sc2@Rdz&3 z-Il^XfeVfXT@2cOE|e}-i(3gqss;4QJNDW*AU=9oR^{{`n5owB^(#i6=F&8ZrBoa` z%RqAZIG$KFVx;p#^GvD|?dSrgCRxB4OhLj~@8Vf6NM$C43ZX}WjVx3c?{4Axb;31` zl5|C+qWhw)CkNyGfVwPTyTn<>>2xM33r6||p#v(7+K;>dyw>Y~s97)7z}PhIf<+($V*pFT^3Ox6#!La4R;ZLAz@u(Kek^|@ei4|*mRL%q^TP$IN?$vhM(C0Gr0 zx#*!TS+OP+nunNqv9imon7YjQIFv^}Gz+dQu^I(bX4;+^Rr@Y%L_~l-0$2ZMLxc$H zVJw;g^W4mbu|ZuHVMv9PY-aKEpF+|>7*Hc;T+F;GzNDoZBWTK;3yB5v80VT3HS{HI zM4+sao@*VE1xw{PDVT=HyAv^34fA4;kNo>nE6t zYyP`)@{Y~D6lwd-91>)8_>PNI&nt^;ad& z`)~cYGl|8aW_oFlY73WMnWZhppF{RdS*D(6WVB%|CBeWdga!On>+Az`v|IFlTKJ^J z`mo6?v!yyM}tURulfAwd~mJvXa zL-wxhC2_w0lRJyk8$5ZX>j{h*<{jgp7OW2LlnPK!OG)jNd#8@*&|c~Kr0}dTpe(oY z3vesKJxvV=%k`Ri!7%NCX2~6+=J~I6Z7}27M(y98!JuFR{5AtYL~iYqYdZv8_YCm6 z?isMly#u;z2jINh4Q}uI@XB6TuSayM@qM8@)CbDjK=uvmweSTYZ9Xd=Lr1S6egWaf zPvyf;ZCM8`+O`6lK!*h3Bf#88*utAbmIew2Nz?>F2aK%AQyVJ zt`5exE!Jsa(M9+t@Q$q#zLA-lnM!#fyh>rJ zbyVchTy}XTHv-KDvgnNLSRqyikxON&nKL2JVq?h1mS;T9J=!dLg=aK7$by?ee#?U$ z8I=xgM$^Kk=pV9RhK1n(Lrpf03GH8l1#%bN9kD=$$wo&dbFVNxk!T>kVb%+PHhi;# ziv)kt)CmBt_ao}%6YyohG786W14}C$_>E~7y~`CJSUEv(%bLRe#?sy5DC;EPl?$GO zNRm)$b@0Bl?OjF39=0T#7>(NgcZnxqS|jQNYH~+B0Sd}`b0C!t6Fz&xpwhMx>0*FL zVz4j{8CZR}+5sw#9AItM@lAqh-OiOc1QK&rZ|~A_&Rk8eZ}+JEV2K6dpfrBzRr_t2 z{Uyevua#H3^k^Ot$M%BTSan?}LA*ixE^8aNSeK%(1FYM`sW-T~t|36x+EUSN+6sn% z10aK(Tk3RIGzK`TVAPb z5b1a}a~+Q*x&Hi^DS>VF`emQNM4rJj;FUIGYL=n-Lsq}$+=~B(*T|jDX1cDiHx&w= zTP#=55YNTFgW6Co*cXIK9$CMsO(zyJpOoIT`s{pzOkLe*jfOkkdT4#MK~Gx4Iw#uJ z9sTz@04;3Hc&$P!B*@%)<=6*gj+3>8vI16OlKsoXdm0lXI;u@f@m#(O+v?O{fvvYH 
zi~F1{EbedQz2Cr+a5o_T4eg@ZL)YK)C@a0+6}YPvdjVE6Cn8A##yl zDE!!MmL_l2iMSg11$A#0OWv>kB=&D}n@54sjQnlVp%uR0Wo-UIJQP7}Q=sao5I9cs zJ}`l{X-xpHSupD0(oG^mHm;}(a2B_YqoECOR&kseVHZIJ+Ep%m^gjGN0ze%MJ`?ar z!`T=bWR}7rIODcnuf)j5n^-8xjZOV2F`l-@FJbP>Enp`v@&vu3X9mY|4xAM$6Cftj zjX4p8>zHg9w$aMn){ama(k^9wJvWJIS{CnVxg(#3&?;~2J9q6B+QlNydXUW}WVQP& z4Vq0yTgL}eYq+q5W|v&Mk(KpemQ+Xs!-dB{i;n@ja3hD5lAHQE0W2tu)iF5Q6+C|(${TQZy|!$bYiE-Ou~d|;*w)? zt;afjCrq3%SdLdK{umY!*0F!oV>#@%NgxHlFix)qD(vOU8Y1g}AWIC5&r^dy4A2#K zv4LM*}RXupqT&}OkGNC7W1ko8@7Jh%LMy!lcXRPPFugZF?ChL3etM7fa3q0@OrUuV| zMHbq1gT0;MY<7#ci6dGQ3*k-|cz%T7(UK1okxiU7+o;#Vhjo(zg&Cd1iKad$E{(zt#SqDsVTe0!+UtV{%yUq`z_qBAiOcO_<5=IyWA=RNT-d-F#v4dH86mlh}WobMmD7 zMzj3pg|S&}`J80X`4|kxVI2|-BCIZwPZ(7)FpOY>vfAX?vuB>1oHnD@+7WtH*ANyt zSX6;b*mZlRteg7Z5{f+yL`%m4Uz5ji0|JvI>_g$AUd??;CfWy92bgS%}@q_cZSt{JB zW9TR*hp6v$TpH?Y?~-B#6w`zCw{6yq2dPse7^Gk1Ajuq%NF+gii9?%)_Q?RPugMM& zNzM+?Wn9Pr$#lq&KCu!|b4lG!VX;>X+mv!i0zzz65(5&Iz*}xyiYq%TLv#L*p{A~3 z36@=hov%8N1LYl|g0mBJ*}PQtv8|kQrOZ8^V%ZT-4|4mDsaSBEwaatasBF$`pI(w?JBTt5FN1hGV z6Pnr1h8PgATj;IaW3P-6#*!3lM2d||$Y!xktY)*si3V6M7qHN^BG0gO?5&g<4&lYf zTNymuyjkjX%%+qxCTt&vCl5nXLRK3#!Jxrx0-$2u>)RqSQlHv{ev zEJt0!M{gYN0zkkt6C}Ds!cJ3)EmYS z@27afB8T_Wy!?kudL|hi%qcMm2m`=ZWdwi3Te~5G{I3v#Bs7BT3J_$IV8!jd$RZ0~ z*pUd2;{t32Op0b~qD;q}2G<|1Xi&3L5X@;ipq-HRsFi4!2n|{i{0b||1#OHCGGLx2 zJz+meBptJ}i7!jgXV(kFGeTK#it2#zYOQ0U1$n{Pzg)z&vert4$XJD)=5}Gi)haHO zIA|(!gK2BK)pr4p02IM1*q^Hr_y8F62R?1Kct16d=w*qQfO#Kh@?nn04Tq@Qosf+z z8AAXy3H1UHzCd%J(aV>v6t285`{H%%2kedGm4kv%uuKer(()R>$anV>9K_>D z9Kd*dFfcmB<|c9;F~a*elM_hVLj_K**zu3?8QvTd?)xSCo_R0x>@!UAOkQR3SthSB zSzw}=EHWuGp@*_;X!sJJp{1qmYsJtI%^4FOBbtYe9@)Jz{CX>I#PokC^-%lYhcwS9h|Tms3c72S1;^N~aUC^k{lGy(>MGPGM`_L^_UNBE2)6 zmfuL4|KjPr>D}pl=^e;*0QvbB$1jERA^c0=jK>|4kI#88!EDGkg!-A|BY3(qeI$J- zos?FT&;a{K6n+XnpB?OM{>yKgU;|)?B{)JfkuRDg)sRZ#8bSv@D2NS!N4+4z*t$K)zA+fI5idPW7NVgySxCSUrT}ZuJp$1jjw>0esqhc*z)6h)rIbO0U;Vm+~Px8j|Os+HOpo?J29PS-K8Dos%8eZAu z;>iuVtu4y~H@~DYo_`!|3z+^kC^${J?3_bc>~Y)3v@!yg};4n8moZ4(w<9b0DO%CPBLhWqQu=MmODo5=*R 
zXjT7IsLUxTgYPI-l`e?K=O`^-7?S`8Qz}&AbOvfW_63pbJkhPmZN$m2WVBY&Vto%M z)1lBNY@idW_Y{X)Sh);$Yib}xuPbhcx@&$9Tm=m{m^gJI7J&A%Xu(9@NIWCHVXzjz zof4VcL<3|HFu2*4D#&3sQE8UT3ZYnvvBDzf9PEz?Hoi@ zMVQb1tmpeC;8c|^j6ujiD1<7VD}f>6lPkFeL^yY@_yh{Io0~Z~Ju`XoiOG{Q^D`&U zojP^y^h{oI%2Q^8iFLW*=Sp(dTx82DFz4kEVx~{B8N59gDDc$X+e5}tl8@sN*uzfT z+z^;;vz=kBUZzfq@$7h#TJxx3ZTUeBc@1 znL?}V7Y&0Ce8_qcZfXl3d*6L*CpQIiXCGT3Z`m65q$pkA97SQDo7@9E$qkGrVXhq6 z^Fd^9Sdn2jkoGW_tQgY^cgS{j& zM7SdJQv>@5i`lm8w@~0aEYh9DMURk%d0Xad#`A?&AHWTbYqR!1R?cBFuD>Cwso{%7 zE68;1>TRB4d45maSJiDmK*|G$4FHfGYm1JRTR~;dW5yu_mi$1`7 z0TA7Eu75~)kVTo$wlDs7^}upwYgiYq;=_aTNC3)n^jkud!lDj9$b?CUwj0+7w52}l z>AY{BWiGK#zNZHQgC}!aEDV9}3cKLJ$xuk}i#X}wtFkJgcMEb#L@HtKGQL0zA96Qk z?&342{qFL^*4S*QlRwzr946u=-tU;tM&^n`+fcEu1Uuskwp%O>VJNmKKRG6gy);2U zJKiAMba%Fy4YJMXY_jV)<^UZ6%#F@@AuqCz1XFq+%`l^b?_ULqwcZhuTc=A6S^GKe z+{ib90oW80_d^@1^Xg&T=l1GSrkxZG9d64ws_kNH$B@q}Aqgna>We*QRaL6*U{NTdaUDp>meDxfpU%hF*MvT=B z#``3AoLPQUg0xR?BpL`s(>B;uDkOfYTCN^dU6J7VUW4&)R_Zb%r zJeYB@gLSag{nrS0(ZOR0*=`Q;UxTgsgKD#&r_#fpfc_9pC|Z6ST#??-qi$~;$pAyX zc)-5Sj*2`SV_i!oG2_?u-DIZh%{N9Tlx^w%x~?Ejc>Vc#Ng9U9fQY1$$6A#Js?peNR~ z?hr~5R%RSAa$-pM=`4Ni;+5~9o$X;ueF4YpNY8x*X9iP{fDuT@_ySw}7uffZC@9~` z8PNZY>al}BxT8)z2%o@7H^;=n1BN=A&&X>)*lT>3&T>sGMmUhgvfNy@RQ9kIo-wn$ z9JHF(@IPUA`8aZW^iAhex`y?72A}p$Fgn~7W&1(vR|qX(b;hm(HA5D&4}jPPRed`E z_EA}boy+Pa*k`Z=Th?OhrP#TxteozGV1g^SH3*YZ)7BuU*RAT+-~E92>wSPI+mj9e z;>NXEI1=fIe6#rZH;{BA9+Aw5fB1qLKYwUK4wDiejCeix!~~__FnK5P7ouN?d?C6Bw+W&> z(y;b%-;U&57rJn=DB!n#vcN}H2s(Z&cFFg+Y(CUPwhrG~#TK?ut$&v-l*uF7deFaJ zu3uZ6%3@1id?O?`W%AgU1~U{Z^kLD9d@fvm=Q8FZ!tW_k(&N2JwM@ zk-Q_NYx*D8C4sx068aX-v9sr literal 0 HcmV?d00001 diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py new file mode 100644 index 0000000..cc497cf --- /dev/null +++ b/bs4/builder/__init__.py @@ -0,0 +1,367 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + nonwhitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. 
+ if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] + features = [] + + is_xml = False + picklable = False + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + DEFAULT_CDATA_LIST_ATTRIBUTES = {} + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + USE_DEFAULT = object() + + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + """Constructor. + + :param multi_valued_attributes: If this is set to None, the + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this do a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. + + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. + + :param preserve_whitespace_tags: + """ + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. 
+ """ + self.soup = soup + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a

tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty

tag + will be presented as "

", not "

". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, str): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. 
+ values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. + """ + + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' + ]) + + # The HTML standard defines these as block-level elements. Beautiful + # Soup does not treat these elements differently from other elements, + # but it may do so eventually, and this information is available if + # you need to use it. 
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. + DEFAULT_CDATA_LIST_ATTRIBUTES = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. 
+ meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +class ParserRejectedMarkup(Exception): + pass + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. + pass diff --git a/bs4/builder/__init__.py.bak b/bs4/builder/__init__.py.bak new file mode 100644 index 0000000..e087f07 --- /dev/null +++ b/bs4/builder/__init__.py.bak @@ -0,0 +1,367 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + nonwhitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. 
+ if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. +builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] + features = [] + + is_xml = False + picklable = False + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + DEFAULT_CDATA_LIST_ATTRIBUTES = {} + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set() + + USE_DEFAULT = object() + + def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT): + """Constructor. + + :param multi_valued_attributes: If this is set to None, the + TreeBuilder will not turn any values for attributes like + 'class' into lists. Setting this do a dictionary will + customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES + for an example. + + Internally, these are called "CDATA list attributes", but that + probably doesn't make sense to an end-user, so the argument name + is `multi_valued_attributes`. + + :param preserve_whitespace_tags: + """ + self.soup = None + if multi_valued_attributes is self.USE_DEFAULT: + multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = multi_valued_attributes + if preserve_whitespace_tags is self.USE_DEFAULT: + preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS + self.preserve_whitespace_tags = preserve_whitespace_tags + + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. 
+ """ + self.soup = soup + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a

tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty

tag + will be presented as "

", not "

". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in attrs.keys(): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, basestring): + values = nonwhitespace_re.findall(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. 
+ values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. + """ + + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from earlier versions of HTML and are removed in HTML5. + 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' + ]) + + # The HTML standard defines these as block-level elements. Beautiful + # Soup does not treat these elements differently from other elements, + # but it may do so eventually, and this information is available if + # you need to use it. 
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. + DEFAULT_CDATA_LIST_ATTRIBUTES = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea']) + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. 
+ meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +class ParserRejectedMarkup(Exception): + pass + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. 
+ pass diff --git a/bs4/builder/__pycache__/__init__.cpython-36.pyc b/bs4/builder/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9a04fd63a15ff7b4a9e270b28e5901333d893bc0 GIT binary patch literal 10084 zcmbta%X1q?dY=~tLlAt3q9{?a>@h7{rdI;}lGpaqvPg-txseb>q$KZnmGJ=G0EZl4 zpq_z5BH&hPsd)F8Q%+lzLpF!(DOLFo@*iwfs!}yM>wnRBDr|+jw3!PHdgm;e1QkAM21qWnt9{hdbr22ylIRTQQ;3R9UDs4dOWRI2NN z?&z{K90R2h6k4WZwu(-%WjPjL3e03hW;G3Gf=xIjR&plUq*G>PXNpZZ(`?$A`9fhc z%zCaci_aeE&MAKCNO9)SJIjjbEy~{0XrE#x+9p2@sCl&KSOM*VY@cDL+5B_GSwR08 zwt)Txeim@&*jaWiSZtkl&Z`Pv;OCE2RXL#EMf6^9F38>s)cZ(j&l?JVWv?XuLsjVa z1EsBfl|r8f^hM{QguXa{UPz(;R26UK;hDarEMG|UyPMm;CR{G7$t2?quN%b7ugBG8 zJz2cDc=)wwHpGV|s3-8i2>)jR*8?D^pPk9h_sIl@O_Je3;qZWpHSMPOem}7^Ft1JgG z_H3mVeYi51ZpC%|wjaB0rE`#2!^Ug0Ap@DBmek%WCm_8t1g#SS=#O?tuSjHo3zdDl zukI?hl*bE)T3^{!``V(i3;g!gBe32R>k&}2tB-n^24v~IkJEGI!uISKbTL?{UGpN& z?67V7ab$b!DHpL%Jp%+r6+tTjt`lq6C1%l+n0`9~usDUrvM%OOIHz(WyY;|}qNF%@ zOJY6E=tvVwNIK)<4YVQ?nWB}|g_odZ2%OPK5%7eXFOZ_YMHbVxK_yJx)5IK62zrL? 
zx9J&E7pYgH-b0{kS3d$LGv!c2()S-`@P_y^JT;D#Lj&&>cFn%gRF1U60w4-<21Bg( z@z$OtzU>zdwYb>VFsA-wy|3?DXaOp1<1j$2fcrVEX#ZSa!@_n^!!w4N$N5$cBlbEFKwYcjhCD(0*tQ%0h z?7C07UXVT!XE3Q)r0jLdh>vp_k(CrA7g3QEWoedSnRPU?Kcbl%NYPJ`jf4{fljh)* zodS}{bf<`9G3X{Ac;u7NJ7qqFe469e(4_cMof*<#5M#4!f|XF7Vw0?la*j>0X_Tj- zO=j5abJdxLj+#SHrj1fvdWi_?}|T|Si zF$9U$zLY8Gy@Fl?5(aIzz*HhnZxO>l2|){V>v1S5rIJ1P`?TW;uVuHO$$a-IsVnAs zL-iGXYHv2|c*l=y{Db^#?5Y~J`4x2XY>XUz*w6eRu-hRZB}Uo@wj4CFA=~ZIz;@v8 zjRv?ZCC=p(YorsRioM0-*l#z}*%={|G%YyA69;mZZ2EdPio=%Q<0)*7gEjn6e9tBt z#hmu7joWJv?rgg^Z>?>wxpy|Vw%xVu?faYS54Jb9vSH)`+cw|#TAhIBP_#*G+V+Cr zKyEwYwkNnS5|OyV`|lzG6oqAsF*CrGB%Cjk1Ao6yLn$yuQgTdvLNH!7(IcMb6BKy=L@jHXbpt z5rICF$p>2-?f_fK#ovmyoOxRy5Q_XJ4{8@eWvETx^hJX%c@-BU5Hf2rUyo%6s!U!hBIb6K7@IMP`iTBk&k; zOs}&I{I3t$H_yF0US!uehu(#q;ClfGkKyzfu?W~XLu-u_`9hP1bsv(PfoS6$d93+m zW#H9iRlJE+EJF#RkoB?6CNrZcT{^HI0WhLHC`Ru>e$+9zO!<<5MoCH(GEN2tnVbj< zkTZ{@v()zmBB1y=fYYhe-l_al@@Z!D978JUDhMN>6k=(b%o!=Vls)LqW!?KZfbaUv z9Wag!iX7sH&cdFK z4puwY<zwO7DiB@9FkgX#(enr0G4mRx)w=P%gZCZSbdu>>IgHeZ* zW`juS3&agJOoSQPOIX_KO6NL+63vOGy>fjiw^zdBz?-ps5`Khhk7TV)#NH0=umg%h z)2oFLYmg*$P%@T9k}#Y6g^fm1kPQzPdSQHJ-e~(Tylndo$yH$+2Z*{K4;KTZlQ$Jr zvYD4w8)1k&QofX`pc=2kOTcE)N87z32TK8Kp2)*y!(e#aL#^~8A&N zEqiHTKUD0s8VouJxy(%Rodf%J-ZLS{!hVK-n>7=h(L zBXDE8F$q#TgtzU^(O&mPJo5CGPe7c2w01(KIu|u zR`FxX$wQWYuAa8w18YYL)8P3U`}bqG2ILbTsqkRe4-5SQd9cxT-w>;)8}MdV;Vcvu zl|wTw^v&2j(u9bMyH?+XFI;RJeeATOv^l z*-8=^mQLFFHT6RJF42R>Bi(%$f&elBjsrD#eeniehw~dgPh`hil zljI(Vr+Q-0^Q0jCmBdWlmhpqVgbtX@-(a@r6=X_*oNE)VwF&2XPMwFt zT~f=IfmZL`JTd+*!$QVZ;czZtTp4>2mrzTJE|EGsZlk((cO#iuyR*GDH8`*8!XTku`Y( zDcV7H+~;RHd~$;!9Enm!q*0p8ViPEfti&c!TCB{bP)@LEHiNQ+`12{0lZZ2)Mp$k^P^#?{O~rMy{Ro`ryAw-xg6?8^Ip3OHLLbCunT( zE+!fp{G>5+c$6q4eE%KN(DFWgJQiG$3Yu&J`lFJPRL~?cei!4OEMNZj03J>;KQ@H~ z$fS`D$V?g$UBoZ|C_o$|oJmrLq>PNAQ8+^1HDK{W4pFoYnANM9Cqe^JiYO5U_7NBT zA)*h4Ro+c>2mtXpddIN#n~BvXZ}a_*xc(0` zI#H3K^eST;gU}H1Mr!O&^((wOT-U|1bsdL1Mxhz5C@4Uq}bG&nY~ zjk7=@3pZ7jKdzAK0BBH 
zgN7#uG)xhup#xJKxP9XYD%ElC1|t|6{ok{7sV`9R2r3gngp8LX{bOogmh!DfRc4R3qM$Y#<&|?Ga@^ zpvJ-F*5FuI`Wrz7*Ujmv^(^;ZWR!>H)LL*b{)^Mze}jyK#KkuSzeoS zgK2cVNU?c}%Tr9A;_(!Vrzku{;OQ=yBJXtf%NCpGj#Q_B_s+8m>=kyAy~Outd33geOMC{ z?DwGa?ua6`+j}e>pq$BPQ|uq614tK6l#HuT2x#CkZFLc0LG0ytsO802(!f!I?JBsJ z7<4C?Xf=`O^$=IQyjA1O(RPHR`>keDkao6XLT^Gr((P+|&#~b2c1gP>MhjQhP63{t z*TRi)*cEl|6v)HebM$95;aD_$1JM_!SZm^Wn>pahRtxGdDZuw?0o-qSP3{!QDrbD( zF`jB>I$nkSZHE7_yC*3gSj)^eHQ!m2ZGfQ;lKSd=# zisAnGJYaB_oKnMY*1Zn-RgQ@|Zj4cecw6B5I~1)%+j$Q=e;?@sqz{pF?KnC_v{M}1(j|qMjAvkl=5n+yko*+O z50ip)cbo~Y4u>cLhCv1*?DP;xa&*B1%AO|0bn>V-O>mv~Dv2fRCjmO@J8(;!LWFoS z&=UcvwQxVtv6hS$wilJuEji{{QcJ$D|oh!UX+>K6%@AnPdt!jNh zz-WVa*nV5ynobBIZ&_tYx3$c~-L8lYSO^4Ztl~bFMgV^ltD=Dw-0qcj;<$5_Kk52U z#ShVw6bH!Zl?S5#YJAY)%LO5g8S)1O86x6i$_mJq&D7hwhPoq%b4SC8ed?Ve8qeZT zIl(5)ZSrTXLM24!QBgF+=aBH5G*nB+t*ADumi4mMyEOKVM&%?AzPemGQ3gwQi@Yyq zTGIS5&KsrP7&&68V|9|$C0X)6ND;YcmSJd`X6p;ZYRQ6&X_9}H{;*p5m9W}O_YkrO zP10&3sUHvS)(_SF+pv-}#0OS!Cypor+3bfpomX5yG+-_^$&SbA=_=8Rwkz+=W!>7H zpi_mRK8M@zG&1nX&9HNj4{SotiLgcSBfAMIZ`1kgl6h-UE@CAhYzG1$osq{0AjC)i=pZ(C(W`8tWnf-lhYPLv_8XYQRfH|nmYX1uf CNg(9_ literal 0 HcmV?d00001 diff --git a/bs4/builder/__pycache__/_html5lib.cpython-36.pyc b/bs4/builder/__pycache__/_html5lib.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ef1b98c7bfe6a50ff03363640e7eaeab487881e9 GIT binary patch literal 11599 zcmbVS+m9sIS+9FnUuL#vc6K~#J9c~RwcVC?HqJd^vtI8l-ryaZ?D$r}lzX~rX1k{^ z<5M-evrP{P9>dyFVjP5!Mc^gmf&2%&g2d&)FMtrDK!AjVWbwj7o}x(b`+aqp?zx~q zx9U{Yxqjz--}!x)Q=gxisr=2)|MRUkUeL6E*Cu}Rc)o-yn%6Zg&^#^BH*`-AjKJ&} zo*{SBGjX?iR^RsQzT-Lll2_`xo~vufw|nJ&#jEtIURB=BUd^i^-|5ZtXT90}3GamD zmr$?n)%$baTz}r1@1OKe>e{^p)O33b{Zrl|YL&eQUe^Ynv$Sw&v!eb~*SyoZHd;iT z3hF#4b*ilMrWTli>7D-e={K~2@rL#dEvN*kGHw{E<4EwNZDahUpFOiu7V(WGf4Cs$hiFxqOh!{CKD zR-Lu&I85q;;o$D=PMj9_Dn!M~Ag>rD?v@JIJ3C>7(Mnf_{eC!zlhV~;JHEFSCMQ-} zcRCxbwO)7&9S$}UbG5b6)DzoZYenI8JXw5LZIj^qXW`)zuIMt7Py@d;@LDIIJu9?A zJ9NU*hUK|fRV%RH)4g)w1f};huM)UH8TV>X397i)WR+%=fsRI>c{K_fx8r{Ai@nZT 
zBOW%^xLA!=BgVolCvIM}(za2ZKi)|Usf(UOGB(DVuZ=CW5bIr#w5RLZ806t^tXr>X z5E2b+F;_<2H;3DeEj7H;2||u>r`6jI8|y>W2zT1O?I85SL33c{skjksZ*2`#e5n2M^`@CTupNcUkG8^gXT1{!Mdyiox1|OcB}yzlHSNTT z!rpq~^jm6kdrO_ftda+VVSAg5^MkP6YpKZ&<_^*u{X|m_qSep3=Wh@D;rUK$a31WI z_R;yP;hnHI++t_vUyVDx==`;{;c)ZZE8AjEUCTN~@^ z`q=zX-?j1-Es-H-W%TZ`GK8gw zLVq~u-TS{v&wk70#wuhEeCntWLY6YHakta!HP*t0)C-oIx@U2Fcyq%$p;8d$3DL?W zSaCg@^N_6J_Eu8aA3`mlK%HVjBu^@2(krB0CG%|+w&Kud?-|PzE9kW2q|%8xgD7qd z+F@cwu}W%#@NOXusl?hcNHRB6osG_*)hnp-%JFbB9CSt)!a?J()%R-c+bwl1+ir

p~hSm4!5NwPN52q)Fc+(%|YEz!QB1TYPa9E0y8< z^`Uw(12RP-y!q_`V83?{JKfJ#vuVf_lT!s)lay<{kb5&aFe&vIdU*p^#QmY^24*#S z{1EMjc{Vi@&PMhXw`QP^^+lqgF`B;xJbQbB{(!u5KBYQ8Ozw_aWDABd7RrApW9tB!!|}ysxc`iS%9> zDw7IQ^Q=rk+L?lMG6h+J3Y|f@8_WhLa4!e-U=H_6Fdv-6y&5b8r*N;`ha&W5z|9BL zISjVarquFB8u-jB`U;Yzv!fU%-O1@Y{a z6iD=B=5pBDjyvnyy<0>SZB(ToET`KPTbClAm{1n#%Y3q-8UaqEi|pzQI?5gf^eRC?}=jw zoN7{RQVL~3<(EjM9_Ua+2JQy#ChjKgRt(y~M00HInNUlBp{_Z$Y3>NCtQAH1gz-nU z-o6neZgek-!~Tr`>psJxI&7VE=d=ca-wC30#&g59ZrG0fyRFCfxC&x3I~{43sWQ(p6-zTl&7Hl@nd*8*`i?Slm@Az zpID25ynEClAU)R}?kbeq$`DGThC0HF98#^I6}JwN*G^4&A#Y6b1VN7{ZnA!{E7+!3 zRl=PBg4Wk}Q^Pq21o&wr*B4mmirUNeij#waY+prynL>ln$#3VPUV#5wO}4Cf6P^>F=OjYGVQjYLKd9C2FeK6k|AlK?IsVxkQ02`99i50;jdL z6%K-=lrgzr&VW2SKxj3seC0O$m)|0p!W(0B<`}`5s9vy!fW=G5BQB7PE<2C+7YK#n}3>V^36{8oO8;>Nvs;*}kWDGk~;`dc1ELino|lC#IK>j_bKC@Cy1k z8a&l&DEi(nMrYpL6bq@sMYmnXLoU7_VCgor4-D}a%su@BJt*DR?(6rB`{ueOI=B3R z`99$kmK;FS>00lD?O0!TN}xwy`L(B)pLxFd^z&b9yz=Vhmu_5XoRN$(%*h^`&yO3o z-n_MX?dE>Y69tiF&_({ylZ$=|mcoS8-iLBk6+XovK>f-4+42JE4fPS65OH&W_eR$W z^i50cNFGSsx*V9hmb_Vb^W+K0zvUs{ejh5Ero>oZfI=TzV|(3N1lx4&ogYh?o0qK{ zY5rhA^2=%dI!r>={zxCZDAYNi?h~m|$y#c8OVoKn>XcCDzobs}h&mKp_aj}+2DP}% z=hMuou$NtP;P83Zl-4r`X+v2REkD`NcAcHK;~L+)Gkc)$s`l2(uLAdXOQ67vXZH-$ z8^{Qzj9}&?pu7~;e_9GBK*uYaV!bNzK~=W}s=>`32FrhN-l zv!p689?%xJqNQ~+U!iw?^g8vk@nvmS-+2(U&4H@=t_!h(BTBJCzyJJts*z8ue9GPY zDYuoNlTzZJJ{C-#P-Jy(Vq6L;XS z?5LMnMKls&K)uMM%0yJC-9i`&kqbl);>1P_svjx37Kzmzb_Ve85Dy7qd$b9JDjYP+ zib#++>>Obzy?@Z0sIW%~Y(*8S_Eyg` zxz6O9Ol~q+Ve%~|x0n#g)fSRuCNtsNVXv2)@l=tR`vq1K?$90JTqC;Gun1k(a(S7q zBdIA4LmP#~kDKQbuIMEsg+GNY2oFfS7u?}b1rF|ZPzqe!ouC|4a4!YbpoY7fhIZje zNkA8#)ErQMK5-dReF=hG05?bDb%i4?A))8*(|V)p0N`K*K+J716NFc1P)+$r4smco zF1et8jw`b$5rvq`2S3_zDX5^CLdc~yCJc7Xz04-C)Y~YNp;Ze{;%LQ)Ir_)w-3XQb z0#|M~PLUuDr)+P^tSkiiEeoL28-ddZ|76pNKASGe*xYmwdP9Et&N6j1h8~OIT1Ho6 zyX(MWN63yYD7-HHLW+;ZN}@m0ET#Bxs!fyEC#?AUfRJLv;}}1kD-~-}S_hW2dc8^0 zUfo8CS1y_#GS7S;ULE1e_x}o2#f{fY?(mvkH;E6Ug$a%nmH#&b3A4$Vl9}`e7;+37 z$g9624dk(=k7kQ;1y28;BPP~ncrQ9;NIElrKQZJ?F(exuJtE!iRPe+|g1{Sz*$W4$ 
ztRHQOBI<$~zW)ysW7UhXvcjWx8;489<8)#3PVOpdpCh;mlQZS&KTZre$;qO^)UbtY zb9*6#Jh8zxox|MZ`sLgEC_)f>Y7Yl-3y!Ss|I-8|rzR%9Ut^_NJrZ%bLDgL*lo7>I z)riTjGoi~ju>#_N2&^g;{28vOjs&hYLY;06m;4n=+qjA!=Y9!S#QuL#NDWIo8!QG70ESPXj}v>kR}acq;nU)tx5dK-9lRGz zc@i(V|1l=w!LSk}h7k+|rWvRCPL9JRwTk)(^wBLN(59#%;4GddO_&@krgJBjQ&y6# z{|#5>WY%FY0IKVmow1M9vU5?dgp1zD6}EX+XTZur=mxWBdI5m+#63#kM=GO*q7xe#2RvXa^ zQX~EURPHkYLERZNd+qc84>ad5Czcj4mRY^ehq8Q6KD1KD5|A9+_wi@iuiUrMtGYi_JF6>mt zScG*~*XJ#0E#$$zHEOmC+jiE59m^W}=wvRg$Hf?meMSfYr=iAT%4NL+e_c7qm%7r*f*MY89YNcbOO3|&_5?&bmAj+N9bdekXdk|k1=u0}_Ld)QufaWw5r7{KqWqw21O0mokZ~LX z!sj|dEBp5SiclT6PL{8 zyLbT2*c6Dn5HMi_(;0|O0Qb6-Soxi{@sbAEohr?95yR2Cw1y#+U9AA$QVPCKpXY7PQNsBBhJYuQd+^?l%b7lr#vrryIVU2ewX6+tnV8NsO%J`WLP`MW6b zTtXB>nw|vCfnf*MQC9d87qE`{8o*`&ZZtp$;!%g$JiU-9n~Hr+O^K%IQWB$O@#GLD z*WX5p;=hT)+{PlA2dGG@D(vIL`w9aKFOeEi!V_>eQC}U;<|3a12WKx1J_WFMOM^;a;o0Of+FH9V>bqI(PsXmC*ObS0 zU;(%klxZ1?9QAo0J4nH4Df@7YTE304^4Nqq<4Eq0@aFhB`YBO@K$8ucD&x|)f;uIs z^Os}1!}PI`r|b^(ElyZGo~T=gM2T?^1wmDu0kzK(F{*zheXdLQG$=5j~R-NX_$ z;5;>k>-fN>6E$!S@>JAFRaQ7?bO!her!^Sj;}^xU_p++U>A@3&C?!$O;AZ`0f z(g2cCNj?iDnH_4CQVjYAI!!mVI7_s@#oAX?Spax3nQkzb8$B`Mgo*=yaFNRG-f$2S zs(!%Lp^HfTuH%n0xw(9%MXh3rsJ)LmgnbYX08bse3~wl*$5=9z+pXwIYCC>Bfi#86 z*e{+Mk$@yBXDKGYF~rqiK&?Qb zuhPi)A<3hyQrMHkA;s2mWZep z855H%{KX&dvtX9y_gsFgyYDVi9Qos&mk0^PGS=ROnVX8Gj*mTu@r^#X%8zNn;F?lH zb+ix&8-Bupxlo!unlA<|HUllG=TLYstScN87VRf88Jb;x%svSYU{-2~H6Kr^2;6kw z1L6BNKS78G>H{HO#&FdDiC4vEAIq7En|^6Sh)F-UDd8!R3Sm00)GRKu!aN(#Q}+4! z=Q1o=eqsi9O}2oXIf&1}^9m*Y9eqCURMBGMI5JW+PEt;u#ZywnO5!X+7BSI7HOFGE zmU@*5L(z$oe)%9yP0D`)aB6?hIzS{#ceISF$8mwq#e? 
zl`X|pUCmS5nx)Co;RISav~pHXmL5p{bxmUWevxm>l9iXG{?uf{L z@vfkqv&Pf@zd%36t%TVx2~4ynT?K80y^QvyWX~^U z{)Y*6qBdTZ=q=7PW6iLc)~tJaAX}H&Wp>HFEZS#KUKuFXRd$t`_EpS13rfyebM1L+ z9(zx@=RTLLs(aqOfcFBcVpf$cpj>b-TH~&Irk+JxX)!vcca^C5rXvG!xK!S2&I-B zNewxY|3rzDpUbkeD@Bs6xDpnW!8Dx6PMp*9n<2Pm+VMDZ+(WzWbpqzvtlbR{ZRRn$ z3?f~?yhf}!d^d>o{R4U*a}+OPF@6EX_wfX@v?Na!`sNGpUK(y&Ck~(=z3ExS6I4)$ z1IPm)o-n)9#98n9=H_9z*YV9au8oRl(Axp81OX(n-3vmq+3qHYn~Vf%zIx4cJizsK z_aL&(plGH)&K~!gT%6H#J1ihMcDh~Wa1!?&hR#Gjn7dTWppMbF{tOz~w!v^;(+N!O zG>KhJ$2ZxNI_nZ^KD(IXADWOYKfETeGusTfx@I#l>yGC!*X;P-Aqw*|20a9>cl=#u zwjI9T>tb1q_c)Nn!D%dHyN4O7ScCKyXCcG9W@x%imkzl{r}PEzK4V>Tw*wTHGO!Iu zlz~h3(Hq1X)u3-)!o9?5JJ>yO^Knj)d7bV~Txg;H@{4X>%&Zfy(vLWWw3xW|S5S$h zmK4bmG@Pq^p&qGdgNFqh5a+5Jcc3rN?BUSRanJIo5|5|-LFn)>bavyh;Y8m>v#KR0 z&`2f5IVTKx5UUtCd7R17bwD>v*#vG0UmFv0f8j-^`zBzmE~EkAq<^!sdT2DIBkjq2 zq^wDg%x&c;cce$UEA1Qn6<4Nuq(!+~u#89f$HfPNGDHg(9z>)Kv?C)j_+DhR6qG6+ zsik5R%IFJ4rrBC6H$u7GMlAJ}nHfR9`W~zA+s{gTRfmj2yYN|{7Ed&TXUWSiqnlr( zFsD`Y`hHNO4a-~?&`g6Yy&2dv1ZxN4WD&-m)k;13JTi~#L9W}g%yychCQiy&T zr$I+*>2aYSI*`2Au!NujlqX3-a_|MJoj44U>HtrmqmYzJuA=m>d}A2PGFT$ zbcae@5je%AlJLgbn`&w z(;?Wh1(6*oZ~6$1a$gs2FvQ|0_q(}IFisByMh*Vy zupAnZlEC>_0apl%4F$0O_s|j{M)~&mgAyOK*2#X+>6yFd>P$?Vs3Vm>pIop&E2N&csBFl zoj}9Jg`L9?HitWhvFnl|?;@JQFO1dt9*>J3lRPCIWB0(94X<+`nw;*psvk3_ zRe(wNgm1{-AjFMqN12^qj<}L?QGE}Hu*Od?KOp9ow4yvI<53KVrJ+wkEEW8nktgIS zQ7@xD2~q7|{SJOgm(EyAucJXmzojJX0oIWA3;bfF9I3wHf@!9RVMxmzUy8EP zSFc_PAgZSU9-k-I$X?&5-K~9U*Y4S?@2}inS*@+#kJVikCiF>+fDqoeKo~jo5JrjU zbCM(zDM?_N;aY+ye~w83`8twZ#)0)b4n6-Jht7brW7CndW0T@cWP1Rf5;Z}9v@VCo zhZQP`v{0Ofwjgg=oQRL3E0z1_9wWQi#IycuOiwfuZngYt+*$(m4En_DLJ@4BGJ<}p z2Kdzohzo@vQRI0fL&HJ=>?{1bs~!m*+|m+ln?Q%TOwhEAzOi<1wf5=ex@qOp8n4nJ z^+XBZMV-HeB33=-Cqy9C^le1Qlq8~xuP|3QnGz^K+)bPfN_?k21Z+kIBEhp{fIb|6 zfrU=I96}F%g9uuvp;v0tfCRGcoB4Ar<~j2Stj zXcWb)IwoX4Qit>l?w~l1G7(`amd0|H&UA*v&#sF2vG`L2cP{MZ=LJNOC6N}A5l2$w zi8%77@T{#dSgkUr+qqNQ$lM~@h4{9FCm`_+C0Jx>ptO_*Y#)V+k=#AM3!#mat_deq z2~~b0)XEaS0X3|+$}K6(@pnaCy(QfB2DsXW^NJm{jXSYU=B^%E6ZMV{3l-X_<`1g* 
zI2ZJYl?g3D`1t(ly$uB5AFbBz-M4EiKUm+glXyHfM@N!A`|g&#w!V3PeRZX_z824n z&PZU<^YQxqn$Z3C*Y9k_r6i!YDP&DU0RAy{x5m;2XdlB2 zP{ zLp*HGClTW{bLfDWLwDd>R?FZ?Js`2=n&iba{WRLVW*+PzI7L>BV%E%h28bu!Bx24M z;H>L8^>hLA{`%$zE34}t!Zw?$L#tdJ2{`Z65OU@bbx;l_K7~lx>$wQ=QWuFhr$})$ zTr0oMVig?5DOQN}x#KBKLm*H3=(up)% zIh=~LVZ?))Mg|Ev=NoX6b8wIwse^nz)LZ!w{?`D1Pz|{s%g?&op~4 z0N*%C)64}XB?ZGpFwpTykk#=0PMBs>Dcic+MD`=gwJe!6(*{8wqoe1PEG#F*nYMM4ZT$a>|VGS5d^3`#m_LZT29^&4>tf=<*Zu za!USjQf0B8?8J#1POzXzG5;6(gDDh=7i7SvC*waM>)^d9xqtEa#zwUoAy-v4kV4!* zQ8{5tV;v$4BFBYpv%c>+J044u-7ELE;D>7)D<7`&vxL$)6tU4@4)hcY5+l4r19ci0 zOVx0ldEQVJlZCCy`~gi8YMG44Kj0B+nVdNKD=L*Dj&!R3!|h;Ue*T?XZ$I3go7oN? zRUiInJ6OK`NW9*@{b->&j~Q>jM$@+E&^z~NVS6r`dHAE5M+-C2{5zKjkic#OdZAi< z$N4v~HZ>}yn<_7AvdQ>m+oXanxS8>Nh99W7|-;n4ZSsmDQya#VIF_VZZbzp=u^fNa~?QN=Byg`6Jx|0)`zdh3*1 z9LVRSqp@(3+JhwGJWajlrK5^pi7LYp%&K5c0nq<~_Zb&SpW&{cMwxbRAon1wpVT@_ zw~^o^dgyM;ttq!KkiS$o?RGB8BUObQtUrU+`C$uhsdWKZD&qVXqx=xQdTN8KV5X-~ zE>eHC=d&m9Jwmunl>Qg_d;0EZ57XN7RbIwSP9FT%^4!<*%-8bmua)K7v0Ut%EyQS} zo*68s03>~M)4al;lGpU~fAQm=(JE9R;v#o}2ULH_8n?5(?2tVPtqV!Q7)r@L*y|wk zE>^U$Vw~sBfygzs0UPJqPFUZID=YXG!|6kZNKC~l@)dCoc`C-OJe+Gv&hsDAjy^%1 z8p@U(VsU)k2;= zxgrI39MoA;s}xY%2oY*@#yVEPR-DqgNs{{s zEiUhiEOsonEuk317X=MwEab1#%sx%hp--6KiS_@15Q&fNuMQP)6eCJGemxv@Q8SmYLqxgLge44+80zrDB?U~+xPBmty`zO4h%I& zntbX4n{tvsAyDF$1?5KQo24gW<5u#21a8y)2Wu!aRa14ftSDff33UdaUW~#S`MhXN k=O>gic}=0g-EOO|ZQc8^~(oE?*%8D}vw(Rx>7OFQ1!iekz3Z0|Vj7ONz+ z)O@(QX<1xyce8S^L4XNz%{8}N66C)KatILQR+k)-Lk0nI4vYlJ@2e(T5?2D2L{(SU zi}ii=eZTL)PbVe{|NKAy`QpnDHSOQDW1n+q{|F`eFH}r(HKsFT+i>-R3Nx7%8cowR zb?v#$Zk}_`HD}zJ=Bzv0Jnx=w&bf2V zd3U~f!M&j76vKsJnHxj&X!`V*zCq!&aR1e zgGUjqZ$+`_HsYY&lJhvnsUy{#v{shZH>CB|1|5~^V-(23+VYdf57*ZpK3kSEYrcrM zc*1viBj)U}FZQ|}Sv)@7#P5KD%5tmGW;nwG9%F7>NGp8P3>!L8jt@O8l=~>rMO3kp zmbY50_qAQ)P&?Egpd9Mpt=XG-*m})H?D!4~6gj>)sHJf$)SN{eZN~3m1>U&auE#|ODRqwXyJMAW~2Y#y#26u_oQT+jb&BJzw zUg}TdAdKqEo9*`A?X~V^7(_c<+-ApSjE&Vdqr3IZL=x&A(O>Hvyvr|c^A>;85%+G< zfeloa*1PBhipJVv*lze?bg!09!+9;D8!QZ>SZ0ElHzV*^XGf@xbUvMcN=o(^-^dw9ObuCeQQ 
zUSl`dO+2r&TkL&2Z?F$o70;W$0bku)xb%+%F?pkB(-quCktV7@LvQcKP8@f>sMq)R z_iOu~*4kpb4hgP*{Q2jf)mtO=s_Nnzc4*Kx^r2@ebq-!)CAWvhF68Y{*EPtniDFTk zkqjHXuOIQ)iFdg3CHK2=u+=%!P7pcGV0$NaHo4P{IIhqMeeC44Th9It zZ#fOY{g}46zrOB(n+o}&<5y*gDJ*qh}J0v+Z zGznA}l5TGc9s=ejBpC&K%X-f_ous2Bs|})qKBP?4E}}&DQ1xLzJ*^LOhdZ1nnhvQ4 z@2C|hZPR-u&c?aE2FS9vw7&Tl1XPKEtmehX*hiWX?sl3tJMn=BQ6>}U#r}4b$nD%o z`0yUCK3jgWvHU=0TM@)J64$Xu0!X$8vlPUhHl z_wnkZL?u+3mDO#XuDhW3uAI)B(Ql*|O@f^w4c!*5Q@;SeZO;%l2y@^`YJX~6&~|lZ zG4l{?S_ccrU!W(hJhLijgLnGRzaw|$ojw3aX5wxqMdn-`E-bDoqhQA`i~- z9l#S%di%hEuAKq$BjdWdBCT$`^;xwbvq9v+OAsW0a-owt)sW@Xt4=JWn7|Zqm*$xm zl39g(T+C8WeguP&%qH|hzZ5$Bgofu}M}!x3cq7;fIP*rcKmc4gK%au@zhbsBkYiS&S1!3#hcR4P>>AtWhw_C z*t(&Y@I4D}Pi~c}A!$$r7>B6$TC1ABKfkoGL$ z^1(~AfK|KpZeC$o0qw$Wk&sO%2ckqRrtg{J?eTTT`kx$M@#@KQ(xk%f#33{U;pq1* zEiT8Cab`r=eZYnM3{8`g09oDZT^q7v?Z5C6F)@kf@$Z1R%3X zL7X6M7eA&dhe}%T3z#h2|LW=LA9(8z-DUB88dDCyR9ghwLCX)*TT}Qbtu4-(G(+B! znMT--xSW343Sd(C16WwbmxVx7ReXeKq}x$1iB&DGDU+J?Oyg%psH^17A|oyJ-*P9O zVd7t*MC6`mz*+j*RtcA0!KIfCy84P<*5~!!#nbVN&YQ+$1ek=t>e|a_fZ>`lJP0?y zHhmS?`8oOpIf1}`r4lD?q%w?ui(wTbX?pL{=|8Nv(;z{k4;jMLz+b^RAY2fS0Sw^4 z0%#Hh>aYQK2|y8P!pQGg;x4s!VG&5ATHmDeEb*5Zv1pa@>M8XEkel@sJUxEUZ`E9zbT?V#wBJ9qBX)o8t17MC$uJg17TPi7I5@D@`}sR}ry z)o6DPj`_NDD}sbU&U!$Kt>6uLK1aT9q6?Ed2zx?=E$}R3*wkuK$%VAYf0r2_Nsh?P zCA&`gtfqK`&)2w94d+7wCn$;{Q;i!gQH1#B zL&zz75z2eKSRkhpV{qVbU&dY*Pk`kGXwIwoH2V2`C1^1ftu(qqz}STc4lHEw8ljBh zh%$H-a&Duqbreh1pnTF6NL;`fXNd+5UDhoeOO}&V1k90Q^%%3nOU(IKl*Bh9Rsk6Z zC3@dGT|rO1@surul_^{P3@=~oAW zkxwm%?SfIYN7a~gn%L?;&>zjCB5(#ElYLffx8W9-^|IN!a60p`HVqa|NTBkG$5>4g zIS0I)Q#h#DPY9yWPvN$)`c8LXQHU3!iAolSs*lstVI!LNbZ179*>jo=e_FsVKSoJLh%WH%AFYn?Q2U<)|a1n z_a7~-uZv|2B{2-Cue4T|9xuzurAHgfPga*UmOXl3mlmB;%+Or&=fo7vwYQM)>LSz< z8`P)V0+Ji$1SJ054SKyv73E42$Csk}1UCPOTI75xXHNvwdP&tQswf~)nkq=BMUSdJ zDmgJsPBwTLjuMn4I-^8_${`SYNlPP2H2@|J3+CF=vj}M{3jvK$%GndPZRg8H+lI#I zh3odLZI)_w-mcgg+eFDWur~b=0Q?9gB1Rph&RvV?NS0@KhFcs7bZ&23a~e|U1`_B+ zw?zMLAt60MiE}*5Dp{@}ot}p$To8oHy}ZgzQwXQlc7(MG@W{d?fQOLm7FzI8R5(USCH#Fr#yQD+ 
z!=Qql!@vs3g&2XG3anTfSQX-8;zgesSRq0vC6M`)C{h8FN*o(;7@_66C5lXJ$+*x?jAm9~)-SLlQ!FgEF#QGt_U8PKCb)j^`$ z3xfu4MFdbe&%-~TXiPTRA*IOyw^7aCY^eOHEIw>@+9FP3Sh_Y@JT^}nK|6`CDYzEr zsZ!cd#Q$xzVX9J@7G-FKiaiu9l&Epltl__M3E7GPbZiA`i3pveS3)~kpk0~YYzDDP z=_KrXkH+c#mHM8>5os?bfP9apMb}VSrUCe&kh-E@Q6Y6nU&!6CO9t3vReq@j{|~NY Bglqr+ literal 0 HcmV?d00001 diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py new file mode 100644 index 0000000..090bb61 --- /dev/null +++ b/bs4/builder/_html5lib.py @@ -0,0 +1,426 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +import re +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + nonwhitespace_re, +) +import html5lib +from html5lib.constants import ( + namespaces, + prefixes, + ) +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError as e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. 
+ def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + + extra_kwargs = dict() + if not isinstance(markup, str): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, str): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '%s' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. 
+ self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s" % (' ' * indent, name)) + else: + rv.append("|%s" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in list(element.attrs.items()): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. 
+ list_attr = self.element.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = nonwhitespace_re.findall(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. + child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + node.parent = self + else: + child = node.element + node.parent = self + + if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "aaa..." 
+ old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + def setAttributes(self, attributes): + + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. 
+ self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. 
+ new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. 
+ element.contents = [] + element.next_element = final_next_element + + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/bs4/builder/_html5lib.py.bak b/bs4/builder/_html5lib.py.bak new file mode 100644 index 0000000..6892a93 --- /dev/null +++ b/bs4/builder/_html5lib.py.bak @@ -0,0 +1,426 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +import re +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + nonwhitespace_re, +) +import html5lib +from html5lib.constants import ( + namespaces, + prefixes, + ) +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. 
The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. + original_encoding = original_encoding.name + doc.original_encoding = original_encoding + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + namespaceHTMLElements, self.soup) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + 
def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s" % (' ' * indent, name)) + else: + rv.append("|%s" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return 
"\n".join(rv) + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = self.element.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = nonwhitespace_re.findall(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, basestring): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. 
+ child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + node.parent = self + else: + child = node.element + node.parent = self + + if not isinstance(child, basestring) and child.parent is not None: + node.element.extract() + + if (string_child is not None and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "aaa..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, basestring): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. 
+ most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + if isinstance(self.element, Comment): + return {} + return AttrList(self.element) + + def setAttributes(self, attributes): + + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. 
+ self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) + if insertBefore: + self.insertBefore(text, insertBefore) + else: + self.appendChild(text) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element + + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. 
+ new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant is not None: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant is not None: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child is not None: + new_parents_last_child.next_sibling = first_child + + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element is not None: + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. 
+ element.contents = [] + element.next_element = final_next_element + + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..ea549c3 --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,350 @@ +# encoding: utf-8 +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from html.parser import HTMLParser + +try: + from html.parser import HTMLParseError +except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. 
+major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # . + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. 
+ tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): + # XXX namespace + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + attr_dict[key] = value + attrvalue = '""' + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed in all supported versions. 
+ # http://bugs.python.org/issue13633 + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) + else: + real_name = int(name) + + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError as e: + pass + if not data: + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + pass + data = data or "\N{REPLACEMENT CHARACTER}" + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. 
"" + data = '' + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + parser_kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, str): + yield (markup, None, None, False) + return + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + parser.close() + except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. 
This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + parser.already_closed_empty_element = [] + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like

as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + 
else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/builder/_htmlparser.py.bak b/bs4/builder/_htmlparser.py.bak new file mode 100644 index 0000000..56b8b91 --- /dev/null +++ b/bs4/builder/_htmlparser.py.bak @@ -0,0 +1,350 @@ +# encoding: utf-8 +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. 
+major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def error(self, msg): + """In Python 3, HTMLParser subclasses must implement error(), although this + requirement doesn't appear to be documented. + + In Python 2, HTMLParser implements error() as raising an exception. + + In any event, this method is called only on very strange markup and our best strategy + is to pretend it didn't happen and keep going. + """ + warnings.warn(msg) + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # . + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. 
+ tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): + # XXX namespace + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + attr_dict[key] = value + attrvalue = '""' + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) + + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed in all supported versions. 
+ # http://bugs.python.org/issue13633 + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) + else: + real_name = int(name) + + data = None + if real_name < 256: + # HTML numeric entities are supposed to reference Unicode + # code points, but sometimes they reference code points in + # some other encoding (ahem, Windows-1252). E.g. “ + # instead of É for LEFT DOUBLE QUOTATION MARK. This + # code tries to detect this situation and compensate. + for encoding in (self.soup.original_encoding, 'windows-1252'): + if not encoding: + continue + try: + data = bytearray([real_name]).decode(encoding) + except UnicodeDecodeError, e: + pass + if not data: + try: + data = unichr(real_name) + except (ValueError, OverflowError), e: + pass + data = data or u"\N{REPLACEMENT CHARACTER}" + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + # If this were XML, it would be ambiguous whether "&foo" + # was an character entity reference with a missing + # semicolon or the literal string "&foo". Since this is + # HTML, we have a complete list of all character entity references, + # and this one wasn't found, so assume it's the literal string "&foo". + data = "&%s" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. 
"" + data = '' + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + parser_kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). + """ + if isinstance(markup, unicode): + yield (markup, None, None, False) + return + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + parser.close() + except HTMLParseError, e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. 
This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + parser.already_closed_empty_element = [] + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like

as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + 
else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py new file mode 100644 index 0000000..a490e23 --- /dev/null +++ b/bs4/builder/_lxml.py @@ -0,0 +1,296 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable + +from io import BytesIO +from io import StringIO +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in list(d.items())) + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. 
+ features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in list(mapping.items()): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value + + def default_parser(self, encoding): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser + + def __init__(self, parser=None, empty_element_tags=None, **kwargs): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. 
+ self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """ + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. + """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. 
+ yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. 
+ attrs = attrs.copy() + for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. 
+ self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '%s' % fragment diff --git a/bs4/builder/_lxml.py.bak b/bs4/builder/_lxml.py.bak new file mode 100644 index 0000000..27cadcb --- /dev/null +++ b/bs4/builder/_lxml.py.bak @@ -0,0 +1,296 @@ +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +try: + from collections.abc import Callable # Python 3.6 +except ImportError , e: + from collections import Callable + +from io import BytesIO +from StringIO import StringIO +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in d.items()) + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. + features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in mapping.items(): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. 
+ self.soup._namespaces[key] = value + + def default_parser(self, encoding): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser + + def __init__(self, parser=None, empty_element_tags=None, **kwargs): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """ + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. + """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. 
+ is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, unicode): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, unicode): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. 
+ self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace 
+ # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'%s' % fragment diff --git a/bs4/dammit.py b/bs4/dammit.py new file mode 100644 index 0000000..c7ac4d4 --- /dev/null +++ b/bs4/dammit.py @@ -0,0 +1,850 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +import codecs +from html.entities import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. + extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: + character = chr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it + # happens within an attribute value, which is handled + # elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. 
+ reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. 
+ # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. + value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. 
There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): + self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? 
+        if self._usable(self.sniffed_encoding, tried):
+            yield self.sniffed_encoding
+
+        # Look within the document for an XML or HTML encoding
+        # declaration.
+        if self.declared_encoding is None:
+            self.declared_encoding = self.find_declared_encoding(
+                self.markup, self.is_html)
+        if self._usable(self.declared_encoding, tried):
+            yield self.declared_encoding
+
+        # Use third-party character set detection to guess at the
+        # encoding.
+        if self.chardet_encoding is None:
+            self.chardet_encoding = chardet_dammit(self.markup)
+        if self._usable(self.chardet_encoding, tried):
+            yield self.chardet_encoding
+
+        # As a last-ditch effort, try utf-8 and windows-1252.
+        for e in ('utf-8', 'windows-1252'):
+            if self._usable(e, tried):
+                yield e
+
+    @classmethod
+    def strip_byte_order_mark(cls, data):
+        """If a byte-order mark is present, strip it and return the encoding it implies."""
+        encoding = None
+        if isinstance(data, str):
+            # Unicode data cannot have a byte-order mark.
+            return data, encoding
+        # NOTE(review): the BOM comparisons below must use bytes literals.
+        # Comparing bytes to the str '\x00\x00' is always True on Python 3,
+        # which made the UTF-16LE branch swallow the UTF-32LE BOM
+        # (b'\xff\xfe\x00\x00') before its own elif could run.
+        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+               and (data[2:4] != b'\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+                 and (data[2:4] != b'\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == b'\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == b'\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == b'\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        return data, encoding
+
+    @classmethod
+    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+        """Given a document, tries to find its declared encoding.
+
+        An XML encoding is declared at the beginning of the document.
+
+        An HTML encoding is declared in a <meta> tag, hopefully near the
+        beginning of the document.
+ """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + declared_encoding = None + declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii', 'replace') + if declared_encoding: + return declared_encoding.lower() + return None + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[]): + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + self.is_html = is_html + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, str) or markup == '': + self.markup = markup + self.unicode_markup = str(markup) + self.original_encoding = None + return + + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. 
+ self.markup = self.detector.markup + + u = None + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break + + if not u: + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. + + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + self.log.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER." + ) + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. + self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. 
+ if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + return str(data, encoding, errors) + + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. 
+ MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". + MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. 
+ b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. 
+ WINDOWS_1252_TO_UTF8 = { + 0x80 : b'\xe2\x82\xac', # € + 0x82 : b'\xe2\x80\x9a', # ‚ + 0x83 : b'\xc6\x92', # ƒ + 0x84 : b'\xe2\x80\x9e', # „ + 0x85 : b'\xe2\x80\xa6', # … + 0x86 : b'\xe2\x80\xa0', # † + 0x87 : b'\xe2\x80\xa1', # ‡ + 0x88 : b'\xcb\x86', # ˆ + 0x89 : b'\xe2\x80\xb0', # ‰ + 0x8a : b'\xc5\xa0', # Š + 0x8b : b'\xe2\x80\xb9', # ‹ + 0x8c : b'\xc5\x92', # Œ + 0x8e : b'\xc5\xbd', # Ž + 0x91 : b'\xe2\x80\x98', # ‘ + 0x92 : b'\xe2\x80\x99', # ’ + 0x93 : b'\xe2\x80\x9c', # “ + 0x94 : b'\xe2\x80\x9d', # ” + 0x95 : b'\xe2\x80\xa2', # • + 0x96 : b'\xe2\x80\x93', # – + 0x97 : b'\xe2\x80\x94', # — + 0x98 : b'\xcb\x9c', # ˜ + 0x99 : b'\xe2\x84\xa2', # ™ + 0x9a : b'\xc5\xa1', # š + 0x9b : b'\xe2\x80\xba', # › + 0x9c : b'\xc5\x93', # œ + 0x9e : b'\xc5\xbe', # ž + 0x9f : b'\xc5\xb8', # Ÿ + 0xa0 : b'\xc2\xa0', #   + 0xa1 : b'\xc2\xa1', # ¡ + 0xa2 : b'\xc2\xa2', # ¢ + 0xa3 : b'\xc2\xa3', # £ + 0xa4 : b'\xc2\xa4', # ¤ + 0xa5 : b'\xc2\xa5', # ¥ + 0xa6 : b'\xc2\xa6', # ¦ + 0xa7 : b'\xc2\xa7', # § + 0xa8 : b'\xc2\xa8', # ¨ + 0xa9 : b'\xc2\xa9', # © + 0xaa : b'\xc2\xaa', # ª + 0xab : b'\xc2\xab', # « + 0xac : b'\xc2\xac', # ¬ + 0xad : b'\xc2\xad', # ­ + 0xae : b'\xc2\xae', # ® + 0xaf : b'\xc2\xaf', # ¯ + 0xb0 : b'\xc2\xb0', # ° + 0xb1 : b'\xc2\xb1', # ± + 0xb2 : b'\xc2\xb2', # ² + 0xb3 : b'\xc2\xb3', # ³ + 0xb4 : b'\xc2\xb4', # ´ + 0xb5 : b'\xc2\xb5', # µ + 0xb6 : b'\xc2\xb6', # ¶ + 0xb7 : b'\xc2\xb7', # · + 0xb8 : b'\xc2\xb8', # ¸ + 0xb9 : b'\xc2\xb9', # ¹ + 0xba : b'\xc2\xba', # º + 0xbb : b'\xc2\xbb', # » + 0xbc : b'\xc2\xbc', # ¼ + 0xbd : b'\xc2\xbd', # ½ + 0xbe : b'\xc2\xbe', # ¾ + 0xbf : b'\xc2\xbf', # ¿ + 0xc0 : b'\xc3\x80', # À + 0xc1 : b'\xc3\x81', # Á + 0xc2 : b'\xc3\x82', #  + 0xc3 : b'\xc3\x83', # à + 0xc4 : b'\xc3\x84', # Ä + 0xc5 : b'\xc3\x85', # Å + 0xc6 : b'\xc3\x86', # Æ + 0xc7 : b'\xc3\x87', # Ç + 0xc8 : b'\xc3\x88', # È + 0xc9 : b'\xc3\x89', # É + 0xca : b'\xc3\x8a', # Ê + 0xcb : b'\xc3\x8b', # Ë + 0xcc : b'\xc3\x8c', # Ì + 0xcd : b'\xc3\x8d', # Í + 0xce 
: b'\xc3\x8e', # Î
+    0xcf : b'\xc3\x8f', # Ï
+    0xd0 : b'\xc3\x90', # Ð
+    0xd1 : b'\xc3\x91', # Ñ
+    0xd2 : b'\xc3\x92', # Ò
+    0xd3 : b'\xc3\x93', # Ó
+    0xd4 : b'\xc3\x94', # Ô
+    0xd5 : b'\xc3\x95', # Õ
+    0xd6 : b'\xc3\x96', # Ö
+    0xd7 : b'\xc3\x97', # ×
+    0xd8 : b'\xc3\x98', # Ø
+    0xd9 : b'\xc3\x99', # Ù
+    0xda : b'\xc3\x9a', # Ú
+    0xdb : b'\xc3\x9b', # Û
+    0xdc : b'\xc3\x9c', # Ü
+    0xdd : b'\xc3\x9d', # Ý
+    0xde : b'\xc3\x9e', # Þ
+    0xdf : b'\xc3\x9f', # ß
+    0xe0 : b'\xc3\xa0', # à
+    0xe1 : b'\xc3\xa1', # á  (fixed: was b'\xa1', a bare Latin-1 byte, not valid UTF-8)
+    0xe2 : b'\xc3\xa2', # â
+    0xe3 : b'\xc3\xa3', # ã
+    0xe4 : b'\xc3\xa4', # ä
+    0xe5 : b'\xc3\xa5', # å
+    0xe6 : b'\xc3\xa6', # æ
+    0xe7 : b'\xc3\xa7', # ç
+    0xe8 : b'\xc3\xa8', # è
+    0xe9 : b'\xc3\xa9', # é
+    0xea : b'\xc3\xaa', # ê
+    0xeb : b'\xc3\xab', # ë
+    0xec : b'\xc3\xac', # ì
+    0xed : b'\xc3\xad', # í
+    0xee : b'\xc3\xae', # î
+    0xef : b'\xc3\xaf', # ï
+    0xf0 : b'\xc3\xb0', # ð
+    0xf1 : b'\xc3\xb1', # ñ
+    0xf2 : b'\xc3\xb2', # ò
+    0xf3 : b'\xc3\xb3', # ó
+    0xf4 : b'\xc3\xb4', # ô
+    0xf5 : b'\xc3\xb5', # õ
+    0xf6 : b'\xc3\xb6', # ö
+    0xf7 : b'\xc3\xb7', # ÷
+    0xf8 : b'\xc3\xb8', # ø
+    0xf9 : b'\xc3\xb9', # ù
+    0xfa : b'\xc3\xba', # ú
+    0xfb : b'\xc3\xbb', # û
+    0xfc : b'\xc3\xbc', # ü
+    0xfd : b'\xc3\xbd', # ý
+    0xfe : b'\xc3\xbe', # þ
+    }
+
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
+    ]
+
+    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
+    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
+
+    @classmethod
+    def detwingle(cls, in_bytes, main_encoding="utf8",
+                  embedded_encoding="windows-1252"):
+        """Fix characters from one encoding embedded in some other encoding.
+
+        Currently the only situation supported is Windows-1252 (or its
+        subset ISO-8859-1), embedded in UTF-8.
+
+        The input must be a bytestring. If you've already converted
+        the document to Unicode, you're too late.
+ + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/bs4/dammit.py.bak b/bs4/dammit.py.bak new file mode 100644 index 0000000..08109f2 --- /dev/null +++ b/bs4/dammit.py.bak @@ -0,0 +1,850 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. 
It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import codecs +from htmlentitydefs import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + + # &apos is an XHTML entity and an HTML 5, but not an HTML 4 + # entity. We don't want to use it, but we want to recognize it on the way in. + # + # TODO: Ideally we would be able to recognize all HTML 5 named + # entities, but that's a little tricky. 
+ extra = [(39, 'apos')] + for codepoint, name in list(codepoint2name.items()) + extra: + character = unichr(codepoint) + if codepoint not in (34, 39): + # There's no point in turning the quotation mark into + # " or the single quote into ', unless it + # happens within an attribute value, which is handled + # elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to recognize those entities on the way in and + # convert them to Unicode characters. + reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. 
+ + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. 
+ value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): + self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. 
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. 
+ return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a tag, hopefully near the + beginning of the document. + """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + declared_encoding = None + declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii', 'replace') + if declared_encoding: + return declared_encoding.lower() + return None + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. 
+ CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[]): + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + self.is_html = is_html + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, unicode) or markup == '': + self.markup = markup + self.unicode_markup = unicode(markup) + self.original_encoding = None + return + + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. + self.markup = self.detector.markup + + u = None + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break + + if not u: + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. + + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + self.log.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER." + ) + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. 
+ self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. 
+ %encoding is a string recognized by encodings.aliases''' + return unicode(data, encoding, errors) + + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. + MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". 
+ MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + 
b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. + WINDOWS_1252_TO_UTF8 = { + 0x80 : b'\xe2\x82\xac', # € + 0x82 : b'\xe2\x80\x9a', # ‚ + 0x83 : b'\xc6\x92', # ƒ + 0x84 : b'\xe2\x80\x9e', # „ + 0x85 : b'\xe2\x80\xa6', # … + 0x86 : b'\xe2\x80\xa0', # † + 0x87 : b'\xe2\x80\xa1', # ‡ + 0x88 : b'\xcb\x86', # ˆ + 0x89 : b'\xe2\x80\xb0', # ‰ + 0x8a : b'\xc5\xa0', # Š + 0x8b : b'\xe2\x80\xb9', # ‹ + 0x8c : b'\xc5\x92', # Œ + 0x8e : b'\xc5\xbd', # Ž + 0x91 : b'\xe2\x80\x98', # ‘ + 0x92 : b'\xe2\x80\x99', # ’ + 0x93 : b'\xe2\x80\x9c', # “ + 0x94 : b'\xe2\x80\x9d', # ” + 0x95 : b'\xe2\x80\xa2', # • + 0x96 : b'\xe2\x80\x93', # – + 0x97 : b'\xe2\x80\x94', # — + 0x98 : b'\xcb\x9c', # ˜ + 0x99 : b'\xe2\x84\xa2', # ™ + 0x9a : b'\xc5\xa1', # š + 0x9b : b'\xe2\x80\xba', # › + 0x9c : b'\xc5\x93', # œ + 0x9e : b'\xc5\xbe', # ž + 0x9f : b'\xc5\xb8', # Ÿ + 0xa0 : b'\xc2\xa0', #   + 0xa1 : b'\xc2\xa1', # ¡ + 0xa2 : b'\xc2\xa2', # ¢ + 0xa3 : b'\xc2\xa3', # £ + 0xa4 : b'\xc2\xa4', # ¤ + 0xa5 : b'\xc2\xa5', # ¥ + 0xa6 : b'\xc2\xa6', # ¦ + 0xa7 : b'\xc2\xa7', # § + 0xa8 : b'\xc2\xa8', # ¨ + 0xa9 : b'\xc2\xa9', # © + 0xaa : b'\xc2\xaa', # ª + 0xab : b'\xc2\xab', # « + 0xac : b'\xc2\xac', # ¬ + 0xad : b'\xc2\xad', # ­ + 0xae : b'\xc2\xae', # ® + 0xaf : b'\xc2\xaf', # ¯ + 0xb0 : b'\xc2\xb0', # ° + 0xb1 : b'\xc2\xb1', # ± + 0xb2 : b'\xc2\xb2', # ² + 0xb3 : b'\xc2\xb3', # ³ + 0xb4 : b'\xc2\xb4', # ´ + 0xb5 : b'\xc2\xb5', # µ + 0xb6 : b'\xc2\xb6', # ¶ + 0xb7 : b'\xc2\xb7', # · + 0xb8 : b'\xc2\xb8', # ¸ + 0xb9 : 
b'\xc2\xb9', # ¹ + 0xba : b'\xc2\xba', # º + 0xbb : b'\xc2\xbb', # » + 0xbc : b'\xc2\xbc', # ¼ + 0xbd : b'\xc2\xbd', # ½ + 0xbe : b'\xc2\xbe', # ¾ + 0xbf : b'\xc2\xbf', # ¿ + 0xc0 : b'\xc3\x80', # À + 0xc1 : b'\xc3\x81', # Á + 0xc2 : b'\xc3\x82', #  + 0xc3 : b'\xc3\x83', # à + 0xc4 : b'\xc3\x84', # Ä + 0xc5 : b'\xc3\x85', # Å + 0xc6 : b'\xc3\x86', # Æ + 0xc7 : b'\xc3\x87', # Ç + 0xc8 : b'\xc3\x88', # È + 0xc9 : b'\xc3\x89', # É + 0xca : b'\xc3\x8a', # Ê + 0xcb : b'\xc3\x8b', # Ë + 0xcc : b'\xc3\x8c', # Ì + 0xcd : b'\xc3\x8d', # Í + 0xce : b'\xc3\x8e', # Î + 0xcf : b'\xc3\x8f', # Ï + 0xd0 : b'\xc3\x90', # Ð + 0xd1 : b'\xc3\x91', # Ñ + 0xd2 : b'\xc3\x92', # Ò + 0xd3 : b'\xc3\x93', # Ó + 0xd4 : b'\xc3\x94', # Ô + 0xd5 : b'\xc3\x95', # Õ + 0xd6 : b'\xc3\x96', # Ö + 0xd7 : b'\xc3\x97', # × + 0xd8 : b'\xc3\x98', # Ø + 0xd9 : b'\xc3\x99', # Ù + 0xda : b'\xc3\x9a', # Ú + 0xdb : b'\xc3\x9b', # Û + 0xdc : b'\xc3\x9c', # Ü + 0xdd : b'\xc3\x9d', # Ý + 0xde : b'\xc3\x9e', # Þ + 0xdf : b'\xc3\x9f', # ß + 0xe0 : b'\xc3\xa0', # à + 0xe1 : b'\xc3\xa1', # á + 0xe2 : b'\xc3\xa2', # â + 0xe3 : b'\xc3\xa3', # ã + 0xe4 : b'\xc3\xa4', # ä + 0xe5 : b'\xc3\xa5', # å + 0xe6 : b'\xc3\xa6', # æ + 0xe7 : b'\xc3\xa7', # ç + 0xe8 : b'\xc3\xa8', # è + 0xe9 : b'\xc3\xa9', # é + 0xea : b'\xc3\xaa', # ê + 0xeb : b'\xc3\xab', # ë + 0xec : b'\xc3\xac', # ì + 0xed : b'\xc3\xad', # í + 0xee : b'\xc3\xae', # î + 0xef : b'\xc3\xaf', # ï + 0xf0 : b'\xc3\xb0', # ð + 0xf1 : b'\xc3\xb1', # ñ + 0xf2 : b'\xc3\xb2', # ò + 0xf3 : b'\xc3\xb3', # ó + 0xf4 : b'\xc3\xb4', # ô + 0xf5 : b'\xc3\xb5', # õ + 0xf6 : b'\xc3\xb6', # ö + 0xf7 : b'\xc3\xb7', # ÷ + 0xf8 : b'\xc3\xb8', # ø + 0xf9 : b'\xc3\xb9', # ù + 0xfa : b'\xc3\xba', # ú + 0xfb : b'\xc3\xbb', # û + 0xfc : b'\xc3\xbc', # ü + 0xfd : b'\xc3\xbd', # ý + 0xfe : b'\xc3\xbe', # þ + } + + MULTIBYTE_MARKERS_AND_SIZES = [ + (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF + (0xe0, 0xef, 3), # 3-byte characters start with E0-EF + (0xf0, 0xf4, 4), # 4-byte
characters start with F0-F4 + ] + + FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] + LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + The input must be a bytestring. If you've already converted + the document to Unicode, you're too late. + + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. 
+ return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/bs4/diagnose.py b/bs4/diagnose.py new file mode 100644 index 0000000..a1ae23d --- /dev/null +++ b/bs4/diagnose.py @@ -0,0 +1,224 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import cProfile +from io import StringIO +from html.parser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems.""" + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print(( + "I noticed that %s is not installed. Installing it may help." % + name)) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: + print ( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: + print ( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' 
% data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + else: + try: + if os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. + pass + print() + + for parser in basic_parsers: + print("Trying to parse your markup with %s" % parser) + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True + except Exception as e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) + + print("-" * 80) + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. 
+ """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print(("%s, %4s, %s" % (event, element.tag, element.text))) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. 
+ tag_name = random.choice(tag_names) + elements.append("" % tag_name) + return "" + "\n".join(elements) + "" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) + data = rdoc(num_elements) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception as e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print("Raw lxml parsed the markup in %.2fs." % (b-a)) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) + +def profile(num_elements=100000, parser="lxml"): + + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/bs4/diagnose.py.bak b/bs4/diagnose.py.bak new file mode 100644 index 0000000..f9835c3 --- /dev/null +++ b/bs4/diagnose.py.bak @@ -0,0 +1,224 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" + +# Use of this source code is governed by the MIT license. 
+__license__ = "MIT" + +import cProfile +from StringIO import StringIO +from HTMLParser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems.""" + print "Diagnostic running on Beautiful Soup %s" % __version__ + print "Python version %s" % sys.version + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print ( + "I noticed that %s is not installed. Installing it may help." % + name) + + if 'lxml' in basic_parsers: + basic_parsers.append("lxml-xml") + try: + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + except ImportError, e: + print ( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + except ImportError, e: + print ( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif data.startswith("http:") or data.startswith("https:"): + print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data + print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + return + else: + try: + if os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + with open(data) as fp: + data = fp.read() + except ValueError: + # This can happen on some platforms when the 'filename' is + # too long. Assume it's data and not a filename. 
+ pass + print + + for parser in basic_parsers: + print "Trying to parse your markup with %s" % parser + success = False + try: + soup = BeautifulSoup(data, features=parser) + success = True + except Exception, e: + print "%s could not parse the markup." % parser + traceback.print_exc() + if success: + print "Here's what %s did with the markup:" % parser + print soup.prettify() + + print "-" * 80 + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print("%s, %4s, %s" % (event, element.tag, element.text)) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." 
+ s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. + tag_name = random.choice(tag_names) + elements.append("" % tag_name) + return "" + "\n".join(elements) + "" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + data = rdoc(num_elements) + print "Generated a large invalid HTML document (%d bytes)." % len(data) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception, e: + print "%s could not parse the markup." % parser + traceback.print_exc() + if success: + print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print "Raw lxml parsed the markup in %.2fs." % (b-a) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print "Raw html5lib parsed the markup in %.2fs." 
% (b-a) + +def profile(num_elements=100000, parser="lxml"): + + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/bs4/element.py b/bs4/element.py new file mode 100644 index 0000000..f16b166 --- /dev/null +++ b/bs4/element.py @@ -0,0 +1,1579 @@ +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +try: + from collections.abc import Callable # Python 3.6 +except ImportError as e: + from collections import Callable +import re +import sys +import warnings +try: + import soupsieve +except ImportError as e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + +from bs4.formatter import ( + Formatter, + HTMLFormatter, + XMLFormatter, +) + +DEFAULT_OUTPUT_ENCODING = "utf-8" +PY3K = (sys.version_info[0] > 2) + +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. +whitespace_re = re.compile(r"\s+") + +def _alias(attr): + """Alias one attribute name to another for backward compatibility""" + @property + def alias(self): + return getattr(self, attr) + + @alias.setter + def alias(self): + return setattr(self, attr) + return alias + + +class NamespacedAttribute(str): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = str.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. 
+ obj = str.__new__(cls, name) + else: + obj = str.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + +class AttributeValueWithCharsetSubstitution(str): + """A stand-in object for a character encoding specified in HTML.""" + +class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'charset' attribute. + + When Beautiful Soup parses the markup '', the + value of the 'charset' attribute will be one of these objects. + """ + + def __new__(cls, original_value): + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + return encoding + + +class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): + """A generic stand-in for the value of a meta tag's 'content' attribute. + + When Beautiful Soup parses the markup: + + + The value of the 'content' attribute will be one of these objects. + """ + + CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. 
+ return str.__new__(str, original_value) + + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + + +class PageElement(object): + """Contains the navigational information for some part of the page + (either a tag or a piece of text)""" + + def setup(self, parent=None, previous_element=None, next_element=None, + previous_sibling=None, next_sibling=None): + """Sets up the initial relations between this element and + other elements.""" + self.parent = parent + + self.previous_element = previous_element + if previous_element is not None: + self.previous_element.next_element = self + + self.next_element = next_element + if self.next_element is not None: + self.next_element.previous_element = self + + self.next_sibling = next_sibling + if self.next_sibling is not None: + self.next_sibling.previous_sibling = self + + if (previous_sibling is None + and self.parent is not None and self.parent.contents): + previous_sibling = self.parent.contents[-1] + + self.previous_sibling = previous_sibling + if previous_sibling is not None: + self.previous_sibling.next_sibling = self + + def format_string(self, s, formatter): + """Format the given string using the given formatter.""" + if formatter is None: + return s + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + output = formatter.substitute(s) + return output + + def formatter_for_name(self, formatter): + """Look up or create a Formatter for the given identifier, + if necessary. + + :param formatter: Can be a Formatter object (used as-is), a + function (used as the entity substitution hook for an + XMLFormatter or HTMLFormatter), or a string (used to look up + an XMLFormatter or HTMLFormatter in the appropriate registry. 
+ """ + if isinstance(formatter, Formatter): + return formatter + if self._is_xml: + c = XMLFormatter + else: + c = HTMLFormatter + if callable(formatter): + return c(entity_substitution=formatter) + return c.REGISTRY[formatter] + + @property + def _is_xml(self): + """Is this element part of an XML tree or an HTML tree? + + This is used in formatter_for_name, when deciding whether an + XMLFormatter or HTMLFormatter is more appropriate. It can be + inefficient, but it should be called very rarely. + """ + if self.known_xml is not None: + # Most of the time we will have determined this when the + # document is parsed. + return self.known_xml + + # Otherwise, it's likely that this element was created by + # direct invocation of the constructor from within the user's + # Python code. + if self.parent is None: + # This is the top-level object. It should have .known_xml set + # from tree creation. If not, take a guess--BS is usually + # used on HTML markup. + return getattr(self, 'is_xml', False) + return self.parent._is_xml + + nextSibling = _alias("next_sibling") # BS3 + previousSibling = _alias("previous_sibling") # BS3 + + def replace_with(self, replace_with): + if self.parent is None: + raise ValueError( + "Cannot replace one element with another when the " + "element to be replaced is not part of a tree.") + if replace_with is self: + return + if replace_with is self.parent: + raise ValueError("Cannot replace a Tag with its parent.") + old_parent = self.parent + my_index = self.parent.index(self) + self.extract() + old_parent.insert(my_index, replace_with) + return self + replaceWith = replace_with # BS3 + + def unwrap(self): + my_parent = self.parent + if self.parent is None: + raise ValueError( + "Cannot replace an element with its contents when that" + "element is not part of a tree.") + my_index = self.parent.index(self) + self.extract() + for child in reversed(self.contents[:]): + my_parent.insert(my_index, child) + return self + replace_with_children = unwrap 
+ replaceWithChildren = unwrap # BS3 + + def wrap(self, wrap_inside): + me = self.replace_with(wrap_inside) + wrap_inside.append(me) + return wrap_inside + + def extract(self): + """Destructively rips this element out of the tree.""" + if self.parent is not None: + del self.parent.contents[self.parent.index(self)] + + #Find the two elements that would be next to each other if + #this element (and any children) hadn't been parsed. Connect + #the two. + last_child = self._last_descendant() + next_element = last_child.next_element + + if (self.previous_element is not None and + self.previous_element is not next_element): + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if (self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling): + self.previous_sibling.next_sibling = self.next_sibling + if (self.next_sibling is not None + and self.next_sibling is not self.previous_sibling): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + "Finds the last element beneath this object to be parsed." + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + # BS3: Not part of the API! 
_lastRecursiveChild = _last_descendant  # BS3: Not part of the API!

def insert(self, position, new_child):
    """Insert new_child into this element's contents at the given
    position, rewiring next/previous element and sibling links.

    Plain strings are promoted to NavigableString; inserting a whole
    BeautifulSoup object splices in its children one at a time.

    :raises ValueError: if new_child is None or this element itself.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    from bs4 import BeautifulSoup
    if isinstance(new_child, BeautifulSoup):
        # We don't want to end up with a situation where one
        # BeautifulSoup object contains another. Insert the
        # children one at a time.
        for subchild in list(new_child.contents):
            self.insert(position, subchild)
            position += 1
        return
    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Walk up the tree looking for the element that follows this
        # tag in document order.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)

def append(self, tag):
    """Appends the given tag to the contents of this tag."""
    self.insert(len(self.contents), tag)

def extend(self, tags):
    """Appends the given tags to the contents of this tag."""
    for tag in tags:
        self.append(tag)
def insert_before(self, *args):
    """Makes the given element(s) the immediate predecessor of this one.

    The elements will have the same parent, and the given elements
    will be immediately before this one.

    :raises ValueError: if this element has no parent, or if asked to
        insert an element before itself.
    """
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'before' has no meaning.")
    if any(x is self for x in args):
        raise ValueError("Can't insert an element before itself.")
    for predecessor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

def insert_after(self, *args):
    """Makes the given element(s) the immediate successor of this one.

    The elements will have the same parent, and the given elements
    will be immediately after this one.

    :raises ValueError: if this element has no parent, or if asked to
        insert an element after itself.
    """
    # Do all error checking before modifying the tree.
    parent = self.parent
    if parent is None:
        raise ValueError(
            "Element has no parent, so 'after' has no meaning.")
    if any(x is self for x in args):
        raise ValueError("Can't insert an element after itself.")

    offset = 0
    for successor in args:
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index + 1 + offset, successor)
        offset += 1

def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears after this Tag in the document."""
    return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
findNext = find_next  # BS3

def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                  **kwargs):
    """Returns all items that match the given criteria and appear
    after this Tag in the document."""
    return self._find_all(name, attrs, text, limit, self.next_elements,
                          **kwargs)
findAllNext = find_all_next  # BS3

def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears after this Tag in the document."""
    return self._find_one(self.find_next_siblings, name, attrs, text,
                          **kwargs)
findNextSibling = find_next_sibling  # BS3

def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                       **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear after this Tag in the document."""
    return self._find_all(name, attrs, text, limit,
                          self.next_siblings, **kwargs)
findNextSiblings = find_next_siblings  # BS3
fetchNextSiblings = find_next_siblings  # BS2

def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears before this Tag in the document."""
    return self._find_one(
        self.find_all_previous, name, attrs, text, **kwargs)
findPrevious = find_previous  # BS3
+ **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1, **kwargs) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." 
+ + if text is None and 'string' in kwargs: + text = kwargs['string'] + del kwargs['string'] + + if isinstance(name, SoupStrainer): + strainer = name + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + + if text is None and not limit and not attrs and not kwargs: + if name is True or name is None: + # Optimization to find all tags. + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) + elif isinstance(name, str): + # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) + else: + prefix = None + local_name = name + result = (element for element in generator + if isinstance(element, Tag) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) + ) + return ResultSet(strainer, result) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. 
@property
def next_elements(self):
    """Yield every element parsed after this one, in document order."""
    i = self.next_element
    while i is not None:
        yield i
        i = i.next_element

@property
def next_siblings(self):
    """Yield every later sibling of this element."""
    i = self.next_sibling
    while i is not None:
        yield i
        i = i.next_sibling

@property
def previous_elements(self):
    """Yield every element parsed before this one, latest first."""
    i = self.previous_element
    while i is not None:
        yield i
        i = i.previous_element

@property
def previous_siblings(self):
    """Yield every earlier sibling of this element."""
    i = self.previous_sibling
    while i is not None:
        yield i
        i = i.previous_sibling

@property
def parents(self):
    """Yield this element's parents, nearest first."""
    i = self.parent
    while i is not None:
        yield i
        i = i.parent

# Old non-property versions of the generators, for backwards
# compatibility with BS3.
def nextGenerator(self):
    return self.next_elements

def nextSiblingGenerator(self):
    return self.next_siblings

def previousGenerator(self):
    return self.previous_elements

def previousSiblingGenerator(self):
    return self.previous_siblings

def parentGenerator(self):
    return self.parents


class NavigableString(str, PageElement):
    """A Python string that knows its place in the parse tree."""

    PREFIX = ''
    SUFFIX = ''

    # We can't tell just by looking at a string whether it's contained
    # in an XML document or an HTML document.
    known_xml = None

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            u = str.__new__(cls, value)
        else:
            u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        u.setup()
        return u

    def __copy__(self):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        # Support for pickling.
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter."""
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

    @property
    def name(self):
        # A string has no tag name.
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")


class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter=None):
        """CData strings are passed into the formatter, purely
        for any side effects. The return value is ignored.
        """
        if formatter is not None:
            ignore = self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
    """A CDATA section."""
    # BUGFIX: the markup delimiters were stripped from these literals
    # in the committed text; restored from upstream Beautiful Soup.
    PREFIX = '<![CDATA['
    SUFFIX = ']]>'


class ProcessingInstruction(PreformattedString):
    """A SGML processing instruction."""
    PREFIX = '<?'
    SUFFIX = '>'


class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction."""
    PREFIX = '<?'
    SUFFIX = '?>'


class Comment(PreformattedString):
    """An HTML/XML comment."""
    PREFIX = '<!--'
    SUFFIX = '-->'


class Declaration(PreformattedString):
    """An XML declaration."""
    PREFIX = '<!'
    SUFFIX = '>'


class Doctype(PreformattedString):
    """A document type declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a root element name plus optional
        public and system identifiers."""
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id
        return Doctype(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'


class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None,
                 is_xml=None):
        "Basic constructor."

        if parser is None:
            self.parser_class = None
        else:
            # We don't actually store the parser object: that lets
            # extracted chunks be garbage-collected.
            self.parser_class = parser.__class__
        if name is None:
            raise ValueError("No value provided for new tag's name.")
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
        if attrs is None:
            attrs = {}
        elif attrs:
            if builder is not None and builder.cdata_list_attributes:
                attrs = builder._replace_cdata_list_attribute_values(
                    self.name, attrs)
            else:
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)

        # If possible, determine ahead of time whether this tag is an
        # XML tag.
        if builder:
            self.known_xml = builder.is_xml
        else:
            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

        if builder is None:
            # In the absence of a TreeBuilder, assume this tag is nothing
            # special.
            self.can_be_empty_element = False
            self.cdata_list_attributes = None
        else:
            # Set up any substitutions for this tag, such as the charset
            # in a META tag.
            builder.set_up_substitutions(self)

            # Ask the TreeBuilder whether this tag might be an
            # empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)

            # Keep track of the list of attributes of this tag that
            # might need to be treated as a list.
            #
            # For performance reasons, we store the whole data structure
            # rather than asking the question of every tag. Asking would
            # require building a new data structure every time, and
            # (unlike can_be_empty_element), we almost never need
            # to check this.
            self.cdata_list_attributes = builder.cdata_list_attributes

            # Keep track of the names that might cause this tag to be
            # treated as a whitespace-preserved tag.
            self.preserve_whitespace_tags = builder.preserve_whitespace_tags

    parserClass = _alias("parser_class")  # BS3

    def __copy__(self):
        """A copy of a Tag is a new Tag, unconnected to the parse tree.
        Its contents are a copy of the old Tag's contents.
        """
        # NOTE(review): __init__ never assigns self.builder, so this
        # attribute presumably comes from elsewhere -- confirm before
        # relying on it.
        clone = type(self)(None, self.builder, self.name, self.namespace,
                           self.prefix, self.attrs, is_xml=self._is_xml)
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
            clone.append(child.__copy__())
        return clone
@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the builder used to create the tag. If the
    builder has a designated list of empty-element tags, then only
    a tag whose name shows up in that list is considered an
    empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.
    """
    return len(self.contents) == 0 and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3

@property
def string(self):
    """Convenience property to get the single string within this tag.

    :return: If this tag has a single string child, return value
        is that string. If this tag has no children, or more than one
        child, return value is None. If this tag has one child tag,
        return value is the 'string' attribute of the child tag,
        recursively.
    """
    if len(self.contents) != 1:
        return None
    child = self.contents[0]
    if isinstance(child, NavigableString):
        return child
    return child.string

@string.setter
def string(self, string):
    # Replace all children with a single string of the same class.
    self.clear()
    self.append(string.__class__(string))

def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    By default, yields only NavigableString and CData objects. So
    no comments, processing instructions, etc.
    """
    for descendant in self.descendants:
        if (
            (types is None and not isinstance(descendant, NavigableString))
            or
                (types is not None and type(descendant) not in types)):
            continue
        if strip:
            descendant = descendant.strip()
            if len(descendant) == 0:
                continue
        yield descendant

strings = property(_all_strings)

@property
def stripped_strings(self):
    """Like .strings, but with leading/trailing whitespace removed and
    whitespace-only strings skipped."""
    for string in self._all_strings(True):
        yield string

def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """
    Get all child strings, concatenated using the given separator.
    """
    return separator.join([s for s in self._all_strings(
        strip, types=types)])
getText = get_text
text = property(get_text)

def decompose(self):
    """Recursively destroys the contents of this tree."""
    self.extract()
    i = self
    while i is not None:
        next = i.next_element
        i.__dict__.clear()
        i.contents = []
        i = next

def clear(self, decompose=False):
    """
    Extract all children. If decompose is True, decompose instead.
    """
    if decompose:
        for element in self.contents[:]:
            if isinstance(element, Tag):
                element.decompose()
            else:
                element.extract()
    else:
        for element in self.contents[:]:
            element.extract()

def smooth(self):
    """Smooth out this element's children by consolidating consecutive
    strings.

    This makes pretty-printed output look more natural following a
    lot of operations that modified the tree.
    """
    # Mark the first position of every pair of children that need
    # to be consolidated. Do this rather than making a copy of
    # self.contents, since in most cases very few strings will be
    # affected.
    marked = []
    for i, a in enumerate(self.contents):
        if isinstance(a, Tag):
            # Recursively smooth children.
            a.smooth()
        if i == len(self.contents) - 1:
            # This is the last item in .contents; there's no
            # following child to pair it with.
            continue
        b = self.contents[i + 1]
        if (isinstance(a, NavigableString)
                and isinstance(b, NavigableString)
                and not isinstance(a, PreformattedString)
                and not isinstance(b, PreformattedString)):
            marked.append(i)

    # Go over the marked positions in reverse order, so that
    # removing items from .contents won't affect the remaining
    # positions.
    for i in reversed(marked):
        a = self.contents[i]
        b = self.contents[i + 1]
        b.extract()
        n = NavigableString(a + b)
        a.replace_with(n)
def index(self, element):
    """
    Find the index of a child by identity, not value. Avoids issues with
    tag.contents.index(element) getting the index of equal elements.
    """
    for i, child in enumerate(self.contents):
        if child is element:
            return i
    raise ValueError("Tag.index: element not in tag")

def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute."""
    return self.attrs.get(key, default)

def get_attribute_list(self, key, default=None):
    """The same as get(), but always returns a list."""
    value = self.get(key, default)
    if not isinstance(value, list):
        value = [value]
    return value

def has_attr(self, key):
    """Does this tag have the given attribute?"""
    return key in self.attrs

def __hash__(self):
    # Hash on the rendered markup.
    return str(self).__hash__()

def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there."""
    return self.attrs[key]

def __iter__(self):
    "Iterating over a tag iterates over its contents."
    return iter(self.contents)

def __len__(self):
    "The length of a tag is the length of its list of contents."
    return len(self.contents)

def __contains__(self, x):
    return x in self.contents

def __bool__(self):
    "A tag is non-None even if it has no contents."
    return True

def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    self.attrs[key] = value

def __delitem__(self, key):
    "Deleting tag[key] deletes all 'key' attributes for the tag."
    self.attrs.pop(key, None)

def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    return self.find_all(*args, **kwargs)

def __getattr__(self, tag):
    """Unknown attribute access becomes a find() for a tag of that
    name, for BS3 compatibility."""
    # print("Getattr %s.%s" % (self.__class__, tag))
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
                name=tag_name
            )
        )
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    elif not tag.startswith("__") and not tag == "contents":
        return self.find(tag)
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (self.__class__, tag))

def __eq__(self, other):
    """Returns true iff this tag has the same name, the same attributes,
    and the same contents (recursively) as the given tag."""
    if self is other:
        return True
    if (not hasattr(other, 'name') or
            not hasattr(other, 'attrs') or
            not hasattr(other, 'contents') or
            self.name != other.name or
            self.attrs != other.attrs or
            len(self) != len(other)):
        return False
    for i, my_child in enumerate(self.contents):
        if my_child != other.contents[i]:
            return False
    return True

def __ne__(self, other):
    """Returns true iff this tag is not identical to the other tag,
    as defined in __eq__."""
    return not self == other

def __repr__(self, encoding="unicode-escape"):
    """Renders this tag as a string."""
    if PY3K:
        # "The return value must be a string object", i.e. Unicode
        return self.decode()
    else:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)

def __unicode__(self):
    return self.decode()

def __str__(self):
    if PY3K:
        return self.decode()
    else:
        return self.encode()

if PY3K:
    __str__ = __repr__ = __unicode__
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag and its contents as a bytestring in `encoding`."""
    # Turn the data structure into Unicode, then encode the Unicode.
    u = self.decode(indent_level, encoding, formatter)
    return u.encode(encoding, errors)

def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. This method is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a tag that mentions the document's
        encoding.
    """

    # First off, turn a non-Formatter `formatter` into a Formatter
    # object. This will stop the lookup from happening over and
    # over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)
    attributes = formatter.attributes(self)
    attrs = []
    for key, val in attributes:
        if val is None:
            decoded = key
        else:
            if isinstance(val, list) or isinstance(val, tuple):
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (
                isinstance(val, AttributeValueWithCharsetSubstitution)
                and eventual_encoding is not None
            ):
                val = val.encode(eventual_encoding)

            text = formatter.attribute_value(val)
            decoded = (
                str(key) + '='
                + formatter.quoted_attribute_value(text))
        attrs.append(decoded)
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = formatter.void_element_close_prefix or ''
    else:
        # BUGFIX: the '</%s%s>' literal was stripped from the committed
        # text; restored from upstream Beautiful Soup.
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter
    )

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s

def _should_pretty_print(self, indent_level):
    """Should this tag be pretty-printed?"""
    return (
        indent_level is not None
        and self.name not in self.preserve_whitespace_tags
    )

def prettify(self, encoding=None, formatter="minimal"):
    """Pretty-print this tag as a Unicode string (or a bytestring if an
    encoding is given)."""
    if encoding is None:
        return self.decode(True, formatter=formatter)
    else:
        return self.encode(encoding, True, formatter=formatter)
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: Each line of the rendering will be
        indented this many spaces.

    :param eventual_encoding: The tag is destined to be
        encoded into this encoding. decode_contents() is _not_
        responsible for performing that encoding. This information
        is passed in so that it can be substituted in if the
        document contains a tag that mentions the document's
        encoding.

    :param formatter: A Formatter object, or a string naming one of
        the standard Formatters.
    """
    # First off, turn a string formatter into a Formatter object. This
    # will stop the lookup from happening over and over again.
    if not isinstance(formatter, Formatter):
        formatter = self.formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    s = []
    for c in self:
        text = None
        if isinstance(c, NavigableString):
            text = c.output_ready(formatter)
        elif isinstance(c, Tag):
            s.append(c.decode(indent_level, eventual_encoding,
                              formatter))
        preserve_whitespace = (
            self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
        )
        if text and indent_level and not preserve_whitespace:
            text = text.strip()
        if text:
            if pretty_print and not preserve_whitespace:
                s.append(" " * (indent_level - 1))
            s.append(text)
            if pretty_print and not preserve_whitespace:
                s.append("\n")
    return ''.join(s)

def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this tag as a bytestring.

    :param indent_level: Each line of the rendering will be
        indented this many spaces.

    :param encoding: The bytestring will be in this encoding.

    :param formatter: The output formatter responsible for converting
        entities to Unicode characters.
    """
    contents = self.decode_contents(indent_level, encoding, formatter)
    return contents.encode(encoding)

# Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    if not prettyPrint:
        indentLevel = None
    return self.encode_contents(
        indent_level=indentLevel, encoding=encoding)

# Soup methods

def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return only the first child of this Tag matching the given
    criteria."""
    r = None
    l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if l:
        r = l[0]
    return r
findChild = find

def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria. You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name."""
    generator = self.descendants
    if not recursive:
        generator = self.children
    return self._find_all(name, attrs, text, limit, generator, **kwargs)
findAll = find_all  # BS3
findChildren = find_all  # BS2

# Generator methods
@property
def children(self):
    """Iterate over this tag's direct children."""
    # return iter() to make the purpose of the method clear
    return iter(self.contents)  # XXX This seems to be untested.

@property
def descendants(self):
    """Iterate over all of this tag's descendants, in document order."""
    if not len(self.contents):
        return
    stopNode = self._last_descendant().next_element
    current = self.contents[0]
    while current is not stopNode:
        yield current
        current = current.next_element

# CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
    """Perform a CSS selection operation on the current element."""
    value = self.select(selector, namespaces, 1, **kwargs)
    if value:
        return value[0]
    return None

def select(self, selector, namespaces=None, limit=None, **kwargs):
    """Perform a CSS selection operation on the current element.

    This uses the SoupSieve library.

    :param selector: A string containing a CSS selector.

    :param namespaces: A dictionary mapping namespace prefixes
        used in the CSS selector to namespace URIs. By default,
        Beautiful Soup will use the prefixes it encountered while
        parsing the document.

    :param limit: After finding this number of results, stop looking.

    :param kwargs: Any extra arguments you'd like to pass in to
        soupsieve.select().
    """
    if namespaces is None:
        namespaces = self._namespaces

    if limit is None:
        # soupsieve treats 0 as "no limit".
        limit = 0
    if soupsieve is None:
        raise NotImplementedError(
            "Cannot execute CSS selectors because the soupsieve package is not installed."
        )

    return soupsieve.select(selector, self, namespaces, limit, **kwargs)
+ """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + def has_key(self, key): + """This was kind of misleading because has_key() (attributes) + was different from __in__ (contents). has_key() is gone in + Python 3, anyway.""" + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in list(attrs.items()): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. 
+        # Compiled regexes are detected duck-typed, via their 'match' method.
+        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
+            or isinstance(value, bool) or value is None):
+            return value
+
+        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
+        if isinstance(value, bytes):
+            return value.decode("utf8")
+
+        # If it's listlike, convert it into a list of strings.
+        if hasattr(value, '__iter__'):
+            new_value = []
+            for v in value:
+                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
+                    and not isinstance(v, str)):
+                    # This is almost certainly the user's mistake. In the
+                    # interests of avoiding infinite loops, we'll let
+                    # it through as-is rather than doing a recursive call.
+                    new_value.append(v)
+                else:
+                    new_value.append(self._normalize_search_value(v))
+            return new_value
+
+        # Otherwise, convert it into a Unicode string.
+        # The unicode(str()) thing is so this will do the same thing on Python 2
+        # and Python 3.
+        return str(str(value))
+
+    def __str__(self):
+        # A text strainer is represented by its text; otherwise show
+        # "name|attrs".
+        if self.text:
+            return self.text
+        else:
+            return "%s|%s" % (self.name, self.attrs)
+
+    def search_tag(self, markup_name=None, markup_attrs={}):
+        # NOTE: markup_attrs={} is a mutable default, but it is never
+        # mutated below (a fresh dict is built when needed), so it's safe.
+        found = None
+        markup = None
+        if isinstance(markup_name, Tag):
+            # When given a Tag, the Tag itself serves as the attribute map.
+            markup = markup_name
+            markup_attrs = markup
+        call_function_with_tag_data = (
+            isinstance(self.name, Callable)
+            and not isinstance(markup_name, Tag))
+
+        if ((not self.name)
+            or call_function_with_tag_data
+            or (markup and self._matches(markup, self.name))
+            or (not markup and self._matches(markup_name, self.name))):
+            if call_function_with_tag_data:
+                match = self.name(markup_name, markup_attrs)
+            else:
+                match = True
+                markup_attr_map = None
+                for attr, match_against in list(self.attrs.items()):
+                    if not markup_attr_map:
+                        if hasattr(markup_attrs, 'get'):
+                            markup_attr_map = markup_attrs
+                        else:
+                            # Lazily build a dict from a sequence of pairs.
+                            markup_attr_map = {}
+                            for k, v in markup_attrs:
+                                markup_attr_map[k] = v
+                    attr_value = markup_attr_map.get(attr)
+                    if not self._matches(attr_value, match_against):
+                        match = False
+                        break
+            if match:
+                if markup:
+ found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, str): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against, already_tried=None): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. + for item in markup: + if self._matches(item, match_against): + return True + # We didn't match any particular value of the multivalue + # attribute, but maybe we match the attribute value when + # considered as a string. + if self._matches(' '.join(markup), match_against): + return True + return False + + if match_against is True: + # True matches any non-None value. + return markup is not None + + if isinstance(match_against, Callable): + return match_against(markup) + + # Custom callables take the tag as an argument, but all + # other ways of matching match the tag name as a string. 
+ original_markup = markup + if isinstance(markup, Tag): + markup = markup.name + + # Ensure that `markup` is either a Unicode string, or None. + markup = self._normalize_search_value(markup) + + if markup is None: + # None matches None, False, an empty string, an empty list, and so on. + return not match_against + + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, str)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, str): + # Exact string match + match = markup == match_against + + if not match and hasattr(match_against, 'search'): + # Regexp match + return match_against.search(markup) + + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match + + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source, result=()): + super(ResultSet, self).__init__(result) + self.source = source + + def __getattr__(self, key): + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. 
Did you call find_all() when you meant to call find()?" % key
+        )
diff --git a/bs4/element.py.bak b/bs4/element.py.bak
new file mode 100644
index 0000000..73e3867
--- /dev/null
+++ b/bs4/element.py.bak
@@ -0,0 +1,1579 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+try:
+    from collections.abc import Callable # Python 3.6
+except ImportError , e:
+    from collections import Callable
+import re
+import sys
+import warnings
+try:
+    import soupsieve
+except ImportError, e:
+    soupsieve = None
+    warnings.warn(
+        'The soupsieve package is not installed. CSS selectors cannot be used.'
+    )
+
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+PY3K = (sys.version_info[0] > 2)
+
+nonwhitespace_re = re.compile(r"\S+")
+
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
+whitespace_re = re.compile(r"\s+")
+
+def _alias(attr):
+    """Alias one attribute name to another for backward compatibility"""
+    @property
+    def alias(self):
+        return getattr(self, attr)
+
+    @alias.setter
+    def alias(self, value):
+        # The setter must accept the assigned value and forward it;
+        # the old no-argument form raised TypeError on every assignment.
+        return setattr(self, attr, value)
+    return alias
+
+
+class NamespacedAttribute(unicode):
+
+    def __new__(cls, prefix, name, namespace=None):
+        if name is None:
+            obj = unicode.__new__(cls, prefix)
+        elif prefix is None:
+            # Not really namespaced.
+            obj = unicode.__new__(cls, name)
+        else:
+            obj = unicode.__new__(cls, prefix + ":" + name)
+        obj.prefix = prefix
+        obj.name = name
+        obj.namespace = namespace
+        return obj
+
+class AttributeValueWithCharsetSubstitution(unicode):
+    """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+    When Beautiful Soup parses the markup '<meta charset="utf8">', the
+    value of the 'charset' attribute will be one of these objects.
+    """
+
+    def __new__(cls, original_value):
+        obj = unicode.__new__(cls, original_value)
+        # Keep the original text so substitution code can refer back to it.
+        obj.original_value = original_value
+        return obj
+
+    def encode(self, encoding):
+        # "Encoding" a charset value simply means emitting the target
+        # encoding's name in its place.
+        return encoding
+
+
+class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'content' attribute.
+
+    When Beautiful Soup parses the markup:
+     <meta http-equiv="content-type" content="text/html; charset=x-sjis">
+
+    The value of the 'content' attribute will be one of these objects.
+    """
+
+    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
+
+    def __new__(cls, original_value):
+        match = cls.CHARSET_RE.search(original_value)
+        if match is None:
+            # No substitution necessary.
+            return unicode.__new__(unicode, original_value)
+
+        obj = unicode.__new__(cls, original_value)
+        obj.original_value = original_value
+        return obj
+
+    def encode(self, encoding):
+        # Rewrite only the charset portion of the 'content' value.
+        def rewrite(match):
+            return match.group(1) + encoding
+        return self.CHARSET_RE.sub(rewrite, self.original_value)
+
+
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent
+
+        self.previous_element = previous_element
+        if previous_element is not None:
+            self.previous_element.next_element = self
+
+        self.next_element = next_element
+        if self.next_element is not None:
+            self.next_element.previous_element = self
+
+        self.next_sibling = next_sibling
+        if self.next_sibling is not None:
+            self.next_sibling.previous_sibling = self
+
+        # When no explicit previous sibling is given, default to the
+        # parent's current last child.
+        if (previous_sibling is None
+            and self.parent is not None and self.parent.contents):
+            previous_sibling = self.parent.contents[-1]
+
+        self.previous_sibling = previous_sibling
+        if previous_sibling is not None:
+            self.previous_sibling.next_sibling = self
+
+    def format_string(self, s, formatter):
+        """Format the given string using the given formatter."""
+        if formatter is None:
+            return s
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        output = formatter.substitute(s)
+        return output
+
+    def formatter_for_name(self, formatter):
+        """Look up or create a Formatter for the given identifier,
+        if necessary.
+
+        :param formatter: Can be a Formatter object (used as-is), a
+        function (used as the entity substitution hook for an
+        XMLFormatter or HTMLFormatter), or a string (used to look up
+        an XMLFormatter or HTMLFormatter in the appropriate registry).
+        """
+        if isinstance(formatter, Formatter):
+            return formatter
+        if self._is_xml:
+            c = XMLFormatter
+        else:
+            c = HTMLFormatter
+        if callable(formatter):
+            return c(entity_substitution=formatter)
+        return c.REGISTRY[formatter]
+
+    @property
+    def _is_xml(self):
+        """Is this element part of an XML tree or an HTML tree?
+
+        This is used in formatter_for_name, when deciding whether an
+        XMLFormatter or HTMLFormatter is more appropriate. It can be
+        inefficient, but it should be called very rarely.
+        """
+        if self.known_xml is not None:
+            # Most of the time we will have determined this when the
+            # document is parsed.
+            return self.known_xml
+
+        # Otherwise, it's likely that this element was created by
+        # direct invocation of the constructor from within the user's
+        # Python code.
+        if self.parent is None:
+            # This is the top-level object. It should have .known_xml set
+            # from tree creation. If not, take a guess--BS is usually
+            # used on HTML markup.
+            return getattr(self, 'is_xml', False)
+        return self.parent._is_xml
+
+    nextSibling = _alias("next_sibling")  # BS3
+    previousSibling = _alias("previous_sibling")  # BS3
+
+    def replace_with(self, replace_with):
+        """Replace this element in the tree with the given element."""
+        if self.parent is None:
+            raise ValueError(
+                "Cannot replace one element with another when the "
+                "element to be replaced is not part of a tree.")
+        if replace_with is self:
+            return
+        if replace_with is self.parent:
+            raise ValueError("Cannot replace a Tag with its parent.")
+        old_parent = self.parent
+        my_index = self.parent.index(self)
+        self.extract()
+        old_parent.insert(my_index, replace_with)
+        return self
+    replaceWith = replace_with  # BS3
+
+    def unwrap(self):
+        """Replace this element with its own contents."""
+        my_parent = self.parent
+        if self.parent is None:
+            raise ValueError(
+                # FIX: a space was missing between the two string halves,
+                # producing "...when thatelement is not part of a tree."
+                "Cannot replace an element with its contents when that "
+                "element is not part of a tree.")
+        my_index = self.parent.index(self)
+        self.extract()
+        for child in reversed(self.contents[:]):
+            my_parent.insert(my_index, child)
+        return self
+    replace_with_children = unwrap
+    replaceWithChildren = unwrap  # BS3
+
+    def wrap(self, wrap_inside):
+        """Wrap this element inside the given element."""
+        me = self.replace_with(wrap_inside)
+        wrap_inside.append(me)
+        return wrap_inside
+
+    def extract(self):
+        """Destructively rips this element out of the tree."""
+        if self.parent is not None:
+            del self.parent.contents[self.parent.index(self)]
+
+        #Find the two elements that would be next to each other if
+        #this element (and any children) hadn't been parsed. Connect
+        #the two.
+ last_child = self._last_descendant() + next_element = last_child.next_element + + if (self.previous_element is not None and + self.previous_element is not next_element): + self.previous_element.next_element = next_element + if next_element is not None and next_element is not self.previous_element: + next_element.previous_element = self.previous_element + self.previous_element = None + last_child.next_element = None + + self.parent = None + if (self.previous_sibling is not None + and self.previous_sibling is not self.next_sibling): + self.previous_sibling.next_sibling = self.next_sibling + if (self.next_sibling is not None + and self.next_sibling is not self.previous_sibling): + self.next_sibling.previous_sibling = self.previous_sibling + self.previous_sibling = self.next_sibling = None + return self + + def _last_descendant(self, is_initialized=True, accept_self=True): + "Finds the last element beneath this object to be parsed." + if is_initialized and self.next_sibling is not None: + last_child = self.next_sibling.previous_element + else: + last_child = self + while isinstance(last_child, Tag) and last_child.contents: + last_child = last_child.contents[-1] + if not accept_self and last_child is self: + last_child = None + return last_child + # BS3: Not part of the API! + _lastRecursiveChild = _last_descendant + + def insert(self, position, new_child): + if new_child is None: + raise ValueError("Cannot insert None into a tag.") + if new_child is self: + raise ValueError("Cannot insert a tag into itself.") + if (isinstance(new_child, basestring) + and not isinstance(new_child, NavigableString)): + new_child = NavigableString(new_child) + + from bs4 import BeautifulSoup + if isinstance(new_child, BeautifulSoup): + # We don't want to end up with a situation where one BeautifulSoup + # object contains another. Insert the children one at a time. 
+            for subchild in list(new_child.contents):
+                self.insert(position, subchild)
+                position += 1
+            return
+        # Clamp an out-of-range position to "append at the end".
+        position = min(position, len(self.contents))
+        if hasattr(new_child, 'parent') and new_child.parent is not None:
+            # We're 'inserting' an element that's already one
+            # of this object's children.
+            if new_child.parent is self:
+                current_index = self.index(new_child)
+                if current_index < position:
+                    # We're moving this element further down the list
+                    # of this object's children. That means that when
+                    # we extract this element, our target index will
+                    # jump down one.
+                    position -= 1
+            new_child.extract()
+
+        new_child.parent = self
+        previous_child = None
+        if position == 0:
+            # Inserting at the front: the parent itself is the element
+            # parsed immediately before the new child.
+            new_child.previous_sibling = None
+            new_child.previous_element = self
+        else:
+            previous_child = self.contents[position - 1]
+            new_child.previous_sibling = previous_child
+            new_child.previous_sibling.next_sibling = new_child
+            # Passing False means is_initialized=False: the new child's
+            # sibling pointers can't be trusted yet, so walk .contents.
+            new_child.previous_element = previous_child._last_descendant(False)
+        if new_child.previous_element is not None:
+            new_child.previous_element.next_element = new_child
+
+        new_childs_last_element = new_child._last_descendant(False)
+
+        if position >= len(self.contents):
+            new_child.next_sibling = None
+
+            # Walk up the ancestors looking for the element that follows
+            # this tag in document order.
+            parent = self
+            parents_next_sibling = None
+            while parents_next_sibling is None and parent is not None:
+                parents_next_sibling = parent.next_sibling
+                parent = parent.parent
+                if parents_next_sibling is not None:
+                    # We found the element that comes next in the document.
+                    break
+            if parents_next_sibling is not None:
+                new_childs_last_element.next_element = parents_next_sibling
+            else:
+                # The last element of this tag is the last element in
+                # the document.
+ new_childs_last_element.next_element = None + else: + next_child = self.contents[position] + new_child.next_sibling = next_child + if new_child.next_sibling is not None: + new_child.next_sibling.previous_sibling = new_child + new_childs_last_element.next_element = next_child + + if new_childs_last_element.next_element is not None: + new_childs_last_element.next_element.previous_element = new_childs_last_element + self.contents.insert(position, new_child) + + def append(self, tag): + """Appends the given tag to the contents of this tag.""" + self.insert(len(self.contents), tag) + + def extend(self, tags): + """Appends the given tags to the contents of this tag.""" + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. + + The elements will have the same parent, and the given elements + will be immediately before this one. + """ + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'before' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements + will be immediately after this one. + """ + # Do all error checking before modifying the tree. + parent = self.parent + if parent is None: + raise ValueError( + "Element has no parent, so 'after' has no meaning.") + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. 
+ if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 + + def find_next(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears after this Tag in the document.""" + return self._find_one(self.find_all_next, name, attrs, text, **kwargs) + findNext = find_next # BS3 + + def find_all_next(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.next_elements, + **kwargs) + findAllNext = find_all_next # BS3 + + def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears after this Tag in the document.""" + return self._find_one(self.find_next_siblings, name, attrs, text, + **kwargs) + findNextSibling = find_next_sibling # BS3 + + def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear after this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.next_siblings, **kwargs) + findNextSiblings = find_next_siblings # BS3 + fetchNextSiblings = find_next_siblings # BS2 + + def find_previous(self, name=None, attrs={}, text=None, **kwargs): + """Returns the first item that matches the given criteria and + appears before this Tag in the document.""" + return self._find_one( + self.find_all_previous, name, attrs, text, **kwargs) + findPrevious = find_previous # BS3 + + def find_all_previous(self, name=None, attrs={}, text=None, limit=None, + **kwargs): + """Returns all items that match the given criteria and appear + before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, self.previous_elements, 
+ **kwargs) + findAllPrevious = find_all_previous # BS3 + fetchPrevious = find_all_previous # BS2 + + def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): + """Returns the closest sibling to this Tag that matches the + given criteria and appears before this Tag in the document.""" + return self._find_one(self.find_previous_siblings, name, attrs, text, + **kwargs) + findPreviousSibling = find_previous_sibling # BS3 + + def find_previous_siblings(self, name=None, attrs={}, text=None, + limit=None, **kwargs): + """Returns the siblings of this Tag that match the given + criteria and appear before this Tag in the document.""" + return self._find_all(name, attrs, text, limit, + self.previous_siblings, **kwargs) + findPreviousSiblings = find_previous_siblings # BS3 + fetchPreviousSiblings = find_previous_siblings # BS2 + + def find_parent(self, name=None, attrs={}, **kwargs): + """Returns the closest parent of this Tag that matches the given + criteria.""" + # NOTE: We can't use _find_one because findParents takes a different + # set of arguments. + r = None + l = self.find_parents(name, attrs, 1, **kwargs) + if l: + r = l[0] + return r + findParent = find_parent # BS3 + + def find_parents(self, name=None, attrs={}, limit=None, **kwargs): + """Returns the parents of this Tag that match the given + criteria.""" + + return self._find_all(name, attrs, None, limit, self.parents, + **kwargs) + findParents = find_parents # BS3 + fetchParents = find_parents # BS2 + + @property + def next(self): + return self.next_element + + @property + def previous(self): + return self.previous_element + + #These methods do the real heavy lifting. + + def _find_one(self, method, name, attrs, text, **kwargs): + r = None + l = method(name, attrs, text, 1, **kwargs) + if l: + r = l[0] + return r + + def _find_all(self, name, attrs, text, limit, generator, **kwargs): + "Iterates over a generator looking for things that match." 
+ + if text is None and 'string' in kwargs: + text = kwargs['string'] + del kwargs['string'] + + if isinstance(name, SoupStrainer): + strainer = name + else: + strainer = SoupStrainer(name, attrs, text, **kwargs) + + if text is None and not limit and not attrs and not kwargs: + if name is True or name is None: + # Optimization to find all tags. + result = (element for element in generator + if isinstance(element, Tag)) + return ResultSet(strainer, result) + elif isinstance(name, basestring): + # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. If this is a namespace-aware document, + # we need to match the local name against tag.name. If not, + # we need to match the fully-qualified name against tag.name. + prefix, local_name = name.split(':', 1) + else: + prefix = None + local_name = name + result = (element for element in generator + if isinstance(element, Tag) + and ( + element.name == name + ) or ( + element.name == local_name + and (prefix is None or element.prefix == prefix) + ) + ) + return ResultSet(strainer, result) + results = ResultSet(strainer) + while True: + try: + i = next(generator) + except StopIteration: + break + if i: + found = strainer.search(i) + if found: + results.append(found) + if limit and len(results) >= limit: + break + return results + + #These generators can be used to navigate starting from both + #NavigableStrings and Tags. 
+ @property + def next_elements(self): + i = self.next_element + while i is not None: + yield i + i = i.next_element + + @property + def next_siblings(self): + i = self.next_sibling + while i is not None: + yield i + i = i.next_sibling + + @property + def previous_elements(self): + i = self.previous_element + while i is not None: + yield i + i = i.previous_element + + @property + def previous_siblings(self): + i = self.previous_sibling + while i is not None: + yield i + i = i.previous_sibling + + @property + def parents(self): + i = self.parent + while i is not None: + yield i + i = i.parent + + # Old non-property versions of the generators, for backwards + # compatibility with BS3. + def nextGenerator(self): + return self.next_elements + + def nextSiblingGenerator(self): + return self.next_siblings + + def previousGenerator(self): + return self.previous_elements + + def previousSiblingGenerator(self): + return self.previous_siblings + + def parentGenerator(self): + return self.parents + + +class NavigableString(unicode, PageElement): + + PREFIX = '' + SUFFIX = '' + + # We can't tell just by looking at a string whether it's contained + # in an XML document or an HTML document. + + known_xml = None + + def __new__(cls, value): + """Create a new NavigableString. + + When unpickling a NavigableString, this method is called with + the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be + passed in to the superclass's __new__ or the superclass won't know + how to handle non-ASCII characters. + """ + if isinstance(value, unicode): + u = unicode.__new__(cls, value) + else: + u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u.setup() + return u + + def __copy__(self): + """A copy of a NavigableString has the same contents and class + as the original, but it is not connected to the parse tree. + """ + return type(self)(self) + + def __getnewargs__(self): + return (unicode(self),) + + def __getattr__(self, attr): + """text.string gives you text. 
This is for backwards + compatibility for Navigable*String, but for CData* it lets you + get the string without the CData wrapper.""" + if attr == 'string': + return self + else: + raise AttributeError( + "'%s' object has no attribute '%s'" % ( + self.__class__.__name__, attr)) + + def output_ready(self, formatter="minimal"): + """Run the string through the provided formatter.""" + output = self.format_string(self, formatter) + return self.PREFIX + output + self.SUFFIX + + @property + def name(self): + return None + + @name.setter + def name(self, name): + raise AttributeError("A NavigableString cannot be given a name.") + +class PreformattedString(NavigableString): + """A NavigableString not subject to the normal formatting rules. + + The string will be passed into the formatter (to trigger side effects), + but the return value will be ignored. + """ + + def output_ready(self, formatter=None): + """CData strings are passed into the formatter, purely + for any side effects. The return value is ignored. 
+        """
+        if formatter is not None:
+            ignore = self.format_string(self, formatter)
+        return self.PREFIX + self + self.SUFFIX
+
+# FIX: the PREFIX/SUFFIX markup delimiters below had been stripped out
+# (everything between '<' and '>' was eaten by an HTML-removal pass),
+# leaving empty strings; restored to the standard SGML/XML delimiters.
+class CData(PreformattedString):
+
+    PREFIX = u'<![CDATA['
+    SUFFIX = u']]>'
+
+class ProcessingInstruction(PreformattedString):
+    """A SGML processing instruction."""
+
+    PREFIX = u'<?'
+    SUFFIX = u'>'
+
+class XMLProcessingInstruction(ProcessingInstruction):
+    """An XML processing instruction."""
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
+class Comment(PreformattedString):
+
+    PREFIX = u'<!--'
+    SUFFIX = u'-->'
+
+
+class Declaration(PreformattedString):
+    PREFIX = u'<?'
+    SUFFIX = u'?>'
+
+
+class Doctype(PreformattedString):
+
+    @classmethod
+    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Build a Doctype value string from a name plus optional
+        public and system identifiers."""
+        value = name or ''
+        if pub_id is not None:
+            value += ' PUBLIC "%s"' % pub_id
+            if system_id is not None:
+                value += ' "%s"' % system_id
+        elif system_id is not None:
+            value += ' SYSTEM "%s"' % system_id
+
+        return Doctype(value)
+
+    PREFIX = u'<!DOCTYPE '
+    SUFFIX = u'>\n'
+
+
+class Tag(PageElement):
+
+    """Represents a found HTML tag with its attributes and contents."""
+
+    def __init__(self, parser=None, builder=None, name=None, namespace=None,
+                 prefix=None, attrs=None, parent=None, previous=None,
+                 is_xml=None):
+        "Basic constructor."
+
+        if parser is None:
+            self.parser_class = None
+        else:
+            # We don't actually store the parser object: that lets extracted
+            # chunks be garbage-collected.
+            self.parser_class = parser.__class__
+        if name is None:
+            raise ValueError("No value provided for new tag's name.")
+        self.name = name
+        self.namespace = namespace
+        self.prefix = prefix
+        if attrs is None:
+            attrs = {}
+        elif attrs:
+            if builder is not None and builder.cdata_list_attributes:
+                attrs = builder._replace_cdata_list_attribute_values(
+                    self.name, attrs)
+            else:
+                attrs = dict(attrs)
+        else:
+            attrs = dict(attrs)
+
+        # If possible, determine ahead of time whether this tag is an
+        # XML tag.
+ if builder: + self.known_xml = builder.is_xml + else: + self.known_xml = is_xml + self.attrs = attrs + self.contents = [] + self.setup(parent, previous) + self.hidden = False + + if builder is None: + # In the absence of a TreeBuilder, assume this tag is nothing + # special. + self.can_be_empty_element = False + self.cdata_list_attributes = None + else: + # Set up any substitutions for this tag, such as the charset in a META tag. + builder.set_up_substitutions(self) + + # Ask the TreeBuilder whether this tag might be an empty-element tag. + self.can_be_empty_element = builder.can_be_empty_element(name) + + # Keep track of the list of attributes of this tag that + # might need to be treated as a list. + # + # For performance reasons, we store the whole data structure + # rather than asking the question of every tag. Asking would + # require building a new data structure every time, and + # (unlike can_be_empty_element), we almost never need + # to check this. + self.cdata_list_attributes = builder.cdata_list_attributes + + # Keep track of the names that might cause this tag to be treated as a + # whitespace-preserved tag. + self.preserve_whitespace_tags = builder.preserve_whitespace_tags + + parserClass = _alias("parser_class") # BS3 + + def __copy__(self): + """A copy of a Tag is a new Tag, unconnected to the parse tree. + Its contents are a copy of the old Tag's contents. + """ + clone = type(self)(None, self.builder, self.name, self.namespace, + self.prefix, self.attrs, is_xml=self._is_xml) + for attr in ('can_be_empty_element', 'hidden'): + setattr(clone, attr, getattr(self, attr)) + for child in self.contents: + clone.append(child.__copy__()) + return clone + + @property + def is_empty_element(self): + """Is this tag an empty-element tag? (aka a self-closing tag) + + A tag that has contents is never an empty-element tag. + + A tag that has no contents may or may not be an empty-element + tag. It depends on the builder used to create the tag. 
If the + builder has a designated list of empty-element tags, then only + a tag whose name shows up in that list is considered an + empty-element tag. + + If the builder has no designated list of empty-element tags, + then any tag with no contents is an empty-element tag. + """ + return len(self.contents) == 0 and self.can_be_empty_element + isSelfClosing = is_empty_element # BS3 + + @property + def string(self): + """Convenience property to get the single string within this tag. + + :Return: If this tag has a single string child, return value + is that string. If this tag has no children, or more than one + child, return value is None. If this tag has one child tag, + return value is the 'string' attribute of the child tag, + recursively. + """ + if len(self.contents) != 1: + return None + child = self.contents[0] + if isinstance(child, NavigableString): + return child + return child.string + + @string.setter + def string(self, string): + self.clear() + self.append(string.__class__(string)) + + def _all_strings(self, strip=False, types=(NavigableString, CData)): + """Yield all strings of certain classes, possibly stripping them. + + By default, yields only NavigableString and CData objects. So + no comments, processing instructions, etc. + """ + for descendant in self.descendants: + if ( + (types is None and not isinstance(descendant, NavigableString)) + or + (types is not None and type(descendant) not in types)): + continue + if strip: + descendant = descendant.strip() + if len(descendant) == 0: + continue + yield descendant + + strings = property(_all_strings) + + @property + def stripped_strings(self): + for string in self._all_strings(True): + yield string + + def get_text(self, separator=u"", strip=False, + types=(NavigableString, CData)): + """ + Get all child strings, concatenated using the given separator. 
+ """ + return separator.join([s for s in self._all_strings( + strip, types=types)]) + getText = get_text + text = property(get_text) + + def decompose(self): + """Recursively destroys the contents of this tree.""" + self.extract() + i = self + while i is not None: + next = i.next_element + i.__dict__.clear() + i.contents = [] + i = next + + def clear(self, decompose=False): + """ + Extract all children. If decompose is True, decompose instead. + """ + if decompose: + for element in self.contents[:]: + if isinstance(element, Tag): + element.decompose() + else: + element.extract() + else: + for element in self.contents[:]: + element.extract() + + def smooth(self): + """Smooth out this element's children by consolidating consecutive strings. + + This makes pretty-printed output look more natural following a + lot of operations that modified the tree. + """ + # Mark the first position of every pair of children that need + # to be consolidated. Do this rather than making a copy of + # self.contents, since in most cases very few strings will be + # affected. + marked = [] + for i, a in enumerate(self.contents): + if isinstance(a, Tag): + # Recursively smooth children. + a.smooth() + if i == len(self.contents)-1: + # This is the last item in .contents, and it's not a + # tag. There's no chance it needs any work. + continue + b = self.contents[i+1] + if (isinstance(a, NavigableString) + and isinstance(b, NavigableString) + and not isinstance(a, PreformattedString) + and not isinstance(b, PreformattedString) + ): + marked.append(i) + + # Go over the marked positions in reverse order, so that + # removing items from .contents won't affect the remaining + # positions. + for i in reversed(marked): + a = self.contents[i] + b = self.contents[i+1] + b.extract() + n = NavigableString(a+b) + a.replace_with(n) + + def index(self, element): + """ + Find the index of a child by identity, not value. Avoids issues with + tag.contents.index(element) getting the index of equal elements. 
+ """ + for i, child in enumerate(self.contents): + if child is element: + return i + raise ValueError("Tag.index: element not in tag") + + def get(self, key, default=None): + """Returns the value of the 'key' attribute for the tag, or + the value given for 'default' if it doesn't have that + attribute.""" + return self.attrs.get(key, default) + + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list.""" + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + + def has_attr(self, key): + return key in self.attrs + + def __hash__(self): + return str(self).__hash__() + + def __getitem__(self, key): + """tag[key] returns the value of the 'key' attribute for the tag, + and throws an exception if it's not there.""" + return self.attrs[key] + + def __iter__(self): + "Iterating over a tag iterates over its contents." + return iter(self.contents) + + def __len__(self): + "The length of a tag is the length of its list of contents." + return len(self.contents) + + def __contains__(self, x): + return x in self.contents + + def __nonzero__(self): + "A tag is non-None even if it has no contents." + return True + + def __setitem__(self, key, value): + """Setting tag[key] sets the value of the 'key' attribute for the + tag.""" + self.attrs[key] = value + + def __delitem__(self, key): + "Deleting tag[key] deletes all 'key' attributes for the tag." + self.attrs.pop(key, None) + + def __call__(self, *args, **kwargs): + """Calling a tag like a function is the same as calling its + find_all() method. Eg. tag('a') returns a list of all the A tags + found within this tag.""" + return self.find_all(*args, **kwargs) + + def __getattr__(self, tag): + #print "Getattr %s.%s" % (self.__class__, tag) + if len(tag) > 3 and tag.endswith('Tag'): + # BS3: soup.aTag -> "soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%(name)sTag is deprecated, use .find("%(name)s") instead. 
If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( + name=tag_name + ) + ) + return self.find(tag_name) + # We special case contents to avoid recursion. + elif not tag.startswith("__") and not tag == "contents": + return self.find(tag) + raise AttributeError( + "'%s' object has no attribute '%s'" % (self.__class__, tag)) + + def __eq__(self, other): + """Returns true iff this tag has the same name, the same attributes, + and the same contents (recursively) as the given tag.""" + if self is other: + return True + if (not hasattr(other, 'name') or + not hasattr(other, 'attrs') or + not hasattr(other, 'contents') or + self.name != other.name or + self.attrs != other.attrs or + len(self) != len(other)): + return False + for i, my_child in enumerate(self.contents): + if my_child != other.contents[i]: + return False + return True + + def __ne__(self, other): + """Returns true iff this tag is not identical to the other tag, + as defined in __eq__.""" + return not self == other + + def __repr__(self, encoding="unicode-escape"): + """Renders this tag as a string.""" + if PY3K: + # "The return value must be a string object", i.e. Unicode + return self.decode() + else: + # "The return value must be a string object", i.e. a bytestring. + # By convention, the return value of __repr__ should also be + # an ASCII string. + return self.encode(encoding) + + def __unicode__(self): + return self.decode() + + def __str__(self): + if PY3K: + return self.decode() + else: + return self.encode() + + if PY3K: + __str__ = __repr__ = __unicode__ + + def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, + indent_level=None, formatter="minimal", + errors="xmlcharrefreplace"): + # Turn the data structure into Unicode, then encode the + # Unicode. 
+ u = self.decode(indent_level, encoding, formatter) + return u.encode(encoding, errors) + + def decode(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a Unicode representation of this tag and its contents. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. This method is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + """ + + # First off, turn a non-Formatter `formatter` into a Formatter + # object. This will stop the lookup from happening over and + # over again. + if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + attributes = formatter.attributes(self) + attrs = [] + for key, val in attributes: + if val is None: + decoded = key + else: + if isinstance(val, list) or isinstance(val, tuple): + val = ' '.join(val) + elif not isinstance(val, basestring): + val = unicode(val) + elif ( + isinstance(val, AttributeValueWithCharsetSubstitution) + and eventual_encoding is not None + ): + val = val.encode(eventual_encoding) + + text = formatter.attribute_value(val) + decoded = ( + unicode(key) + '=' + + formatter.quoted_attribute_value(text)) + attrs.append(decoded) + close = '' + closeTag = '' + + prefix = '' + if self.prefix: + prefix = self.prefix + ":" + + if self.is_empty_element: + close = formatter.void_element_close_prefix or '' + else: + closeTag = '</%s%s>' % (prefix, self.name) + + pretty_print = self._should_pretty_print(indent_level) + space = '' + indent_space = '' + if indent_level is not None: + indent_space = (' ' * (indent_level - 1)) + if pretty_print: + space = indent_space + indent_contents = indent_level + 1 + else: + indent_contents = None + contents = self.decode_contents( + indent_contents, eventual_encoding, formatter + ) + + if self.hidden: + # This is the 'document root' object.
+ s = contents + else: + s = [] + attribute_string = '' + if attrs: + attribute_string = ' ' + ' '.join(attrs) + if indent_level is not None: + # Even if this particular tag is not pretty-printed, + # we should indent up to the start of the tag. + s.append(indent_space) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) + if pretty_print: + s.append("\n") + s.append(contents) + if pretty_print and contents and contents[-1] != "\n": + s.append("\n") + if pretty_print and closeTag: + s.append(space) + s.append(closeTag) + if indent_level is not None and closeTag and self.next_sibling: + # Even if this particular tag is not pretty-printed, + # we're now done with the tag, and we should add a + # newline if appropriate. + s.append("\n") + s = ''.join(s) + return s + + def _should_pretty_print(self, indent_level): + """Should this tag be pretty-printed?""" + return ( + indent_level is not None + and self.name not in self.preserve_whitespace_tags + ) + + def prettify(self, encoding=None, formatter="minimal"): + if encoding is None: + return self.decode(True, formatter=formatter) + else: + return self.encode(encoding, True, formatter=formatter) + + def decode_contents(self, indent_level=None, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a Unicode string. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The tag is destined to be + encoded into this encoding. decode_contents() is _not_ + responsible for performing that encoding. This information + is passed in so that it can be substituted in if the + document contains a tag that mentions the document's + encoding. + + :param formatter: A Formatter object, or a string naming one of + the standard Formatters. + """ + # First off, turn a string formatter into a Formatter object. This + # will stop the lookup from happening over and over again. 
+ if not isinstance(formatter, Formatter): + formatter = self.formatter_for_name(formatter) + + pretty_print = (indent_level is not None) + s = [] + for c in self: + text = None + if isinstance(c, NavigableString): + text = c.output_ready(formatter) + elif isinstance(c, Tag): + s.append(c.decode(indent_level, eventual_encoding, + formatter)) + preserve_whitespace = ( + self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags + ) + if text and indent_level and not preserve_whitespace: + text = text.strip() + if text: + if pretty_print and not preserve_whitespace: + s.append(" " * (indent_level - 1)) + s.append(text) + if pretty_print and not preserve_whitespace: + s.append("\n") + return ''.join(s) + + def encode_contents( + self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Renders the contents of this tag as a bytestring. + + :param indent_level: Each line of the rendering will be + indented this many spaces. + + :param eventual_encoding: The bytestring will be in this encoding. + + :param formatter: The output formatter responsible for converting + entities to Unicode characters. + """ + + contents = self.decode_contents(indent_level, encoding, formatter) + return contents.encode(encoding) + + # Old method for BS3 compatibility + def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, + prettyPrint=False, indentLevel=0): + if not prettyPrint: + indentLevel = None + return self.encode_contents( + indent_level=indentLevel, encoding=encoding) + + #Soup methods + + def find(self, name=None, attrs={}, recursive=True, text=None, + **kwargs): + """Return only the first child of this Tag matching the given + criteria.""" + r = None + l = self.find_all(name, attrs, recursive, text, 1, **kwargs) + if l: + r = l[0] + return r + findChild = find + + def find_all(self, name=None, attrs={}, recursive=True, text=None, + limit=None, **kwargs): + """Extracts a list of Tag objects that match the given + criteria. 
You can specify the name of the Tag and any + attributes you want the Tag to have. + + The value of a key-value pair in the 'attrs' map can be a + string, a list of strings, a regular expression object, or a + callable that takes a string and returns whether or not the + string matches for some custom definition of 'matches'. The + same is true of the tag name.""" + + generator = self.descendants + if not recursive: + generator = self.children + return self._find_all(name, attrs, text, limit, generator, **kwargs) + findAll = find_all # BS3 + findChildren = find_all # BS2 + + #Generator methods + @property + def children(self): + # return iter() to make the purpose of the method clear + return iter(self.contents) # XXX This seems to be untested. + + @property + def descendants(self): + if not len(self.contents): + return + stopNode = self._last_descendant().next_element + current = self.contents[0] + while current is not stopNode: + yield current + current = current.next_element + + # CSS selector code + def select_one(self, selector, namespaces=None, **kwargs): + """Perform a CSS selection operation on the current element.""" + value = self.select(selector, namespaces, 1, **kwargs) + if value: + return value[0] + return None + + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. + + This uses the SoupSieve library. + + :param selector: A string containing a CSS selector. + + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. + + :param limit: After finding this number of results, stop looking. + + :param kwargs: Any extra arguments you'd like to pass in to + soupsieve.select(). 
+ """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) + + # Old names for backwards compatibility + def childGenerator(self): + return self.children + + def recursiveChildGenerator(self): + return self.descendants + + def has_key(self, key): + """This was kind of misleading because has_key() (attributes) + was different from __in__ (contents). has_key() is gone in + Python 3, anyway.""" + warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( + key)) + return self.has_attr(key) + +# Next, a couple classes to represent queries and their results. +class SoupStrainer(object): + """Encapsulates a number of ways of matching a markup element (tag or + text).""" + + def __init__(self, name=None, attrs={}, text=None, **kwargs): + self.name = self._normalize_search_value(name) + if not isinstance(attrs, dict): + # Treat a non-dict value for attrs as a search for the 'class' + # attribute. + kwargs['class'] = attrs + attrs = None + + if 'class_' in kwargs: + # Treat class_="foo" as a search for the 'class' + # attribute, overriding any non-dict value for attrs. + kwargs['class'] = kwargs['class_'] + del kwargs['class_'] + + if kwargs: + if attrs: + attrs = attrs.copy() + attrs.update(kwargs) + else: + attrs = kwargs + normalized_attrs = {} + for key, value in attrs.items(): + normalized_attrs[key] = self._normalize_search_value(value) + + self.attrs = normalized_attrs + self.text = self._normalize_search_value(text) + + def _normalize_search_value(self, value): + # Leave it alone if it's a Unicode string, a callable, a + # regular expression, a boolean, or None. 
+ if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') + or isinstance(value, bool) or value is None): + return value + + # If it's a bytestring, convert it to Unicode, treating it as UTF-8. + if isinstance(value, bytes): + return value.decode("utf8") + + # If it's listlike, convert it into a list of strings. + if hasattr(value, '__iter__'): + new_value = [] + for v in value: + if (hasattr(v, '__iter__') and not isinstance(v, bytes) + and not isinstance(v, unicode)): + # This is almost certainly the user's mistake. In the + # interests of avoiding infinite loops, we'll let + # it through as-is rather than doing a recursive call. + new_value.append(v) + else: + new_value.append(self._normalize_search_value(v)) + return new_value + + # Otherwise, convert it into a Unicode string. + # The unicode(str()) thing is so this will do the same thing on Python 2 + # and Python 3. + return unicode(str(value)) + + def __str__(self): + if self.text: + return self.text + else: + return "%s|%s" % (self.name, self.attrs) + + def search_tag(self, markup_name=None, markup_attrs={}): + found = None + markup = None + if isinstance(markup_name, Tag): + markup = markup_name + markup_attrs = markup + call_function_with_tag_data = ( + isinstance(self.name, Callable) + and not isinstance(markup_name, Tag)) + + if ((not self.name) + or call_function_with_tag_data + or (markup and self._matches(markup, self.name)) + or (not markup and self._matches(markup_name, self.name))): + if call_function_with_tag_data: + match = self.name(markup_name, markup_attrs) + else: + match = True + markup_attr_map = None + for attr, match_against in list(self.attrs.items()): + if not markup_attr_map: + if hasattr(markup_attrs, 'get'): + markup_attr_map = markup_attrs + else: + markup_attr_map = {} + for k, v in markup_attrs: + markup_attr_map[k] = v + attr_value = markup_attr_map.get(attr) + if not self._matches(attr_value, match_against): + match = False + break + if match: + 
if markup: + found = markup + else: + found = markup_name + if found and self.text and not self._matches(found.string, self.text): + found = None + return found + searchTag = search_tag + + def search(self, markup): + # print 'looking for %s in %s' % (self, markup) + found = None + # If given a list of items, scan it for a text element that + # matches. + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + for element in markup: + if isinstance(element, NavigableString) \ + and self.search(element): + found = element + break + # If it's a Tag, make sure its name or attributes match. + # Don't bother with Tags if we're searching for text. + elif isinstance(markup, Tag): + if not self.text or self.name or self.attrs: + found = self.search_tag(markup) + # If it's text, make sure the text matches. + elif isinstance(markup, NavigableString) or \ + isinstance(markup, basestring): + if not self.name and not self.attrs and self._matches(markup, self.text): + found = markup + else: + raise Exception( + "I don't know how to match against a %s" % markup.__class__) + return found + + def _matches(self, markup, match_against, already_tried=None): + # print u"Matching %s against %s" % (markup, match_against) + result = False + if isinstance(markup, list) or isinstance(markup, tuple): + # This should only happen when searching a multi-valued attribute + # like 'class'. + for item in markup: + if self._matches(item, match_against): + return True + # We didn't match any particular value of the multivalue + # attribute, but maybe we match the attribute value when + # considered as a string. + if self._matches(' '.join(markup), match_against): + return True + return False + + if match_against is True: + # True matches any non-None value. + return markup is not None + + if isinstance(match_against, Callable): + return match_against(markup) + + # Custom callables take the tag as an argument, but all + # other ways of matching match the tag name as a string. 
+ original_markup = markup + if isinstance(markup, Tag): + markup = markup.name + + # Ensure that `markup` is either a Unicode string, or None. + markup = self._normalize_search_value(markup) + + if markup is None: + # None matches None, False, an empty string, an empty list, and so on. + return not match_against + + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, basestring)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, unicode): + # Exact string match + match = markup == match_against + + if not match and hasattr(match_against, 'search'): + # Regexp match + return match_against.search(markup) + + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match + + +class ResultSet(list): + """A ResultSet is just a list that keeps track of the SoupStrainer + that created it.""" + def __init__(self, source, result=()): + super(ResultSet, self).__init__(result) + self.source = source + + def __getattr__(self, key): + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. 
Did you call find_all() when you meant to call find()?" % key + ) diff --git a/bs4/formatter.py b/bs4/formatter.py new file mode 100644 index 0000000..7dbaa38 --- /dev/null +++ b/bs4/formatter.py @@ -0,0 +1,99 @@ +from bs4.dammit import EntitySubstitution + +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. + + Some parts of this strategy come from the distinction between + HTML4, HTML5, and XML. Others are configurable by the user. + """ + # Registries of XML and HTML formatters. + XML_FORMATTERS = {} + HTML_FORMATTERS = {} + + HTML = 'html' + XML = 'xml' + + HTML_DEFAULTS = dict( + cdata_containing_tags=set(["script", "style"]), + ) + + def _default(self, language, value, kwarg): + if value is not None: + return value + if language == self.XML: + return set() + return self.HTML_DEFAULTS[kwarg] + + def __init__( + self, language=None, entity_substitution=None, + void_element_close_prefix='/', cdata_containing_tags=None, + ): + """ + + :param void_element_close_prefix: By default, represent void + elements as <br/> rather than <br> + """ + self.language = language + self.entity_substitution = entity_substitution + self.void_element_close_prefix = void_element_close_prefix + self.cdata_containing_tags = self._default( + language, cdata_containing_tags, 'cdata_containing_tags' + ) + + def substitute(self, ns): + """Process a string that needs to undergo entity substitution.""" + if not self.entity_substitution: + return ns + from .element import NavigableString + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in self.cdata_containing_tags): + # Do nothing. + return ns + # Substitute.
+ return self.entity_substitution(ns) + + def attribute_value(self, value): + """Process the value of an attribute.""" + return self.substitute(value) + + def attributes(self, tag): + """Reorder a tag's attributes however you want.""" + return sorted(tag.attrs.items()) + + +class HTMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) + + +class XMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) + + +# Set up aliases for the default formatters. +HTMLFormatter.REGISTRY['html'] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html, + void_element_close_prefix = None +) +HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +HTMLFormatter.REGISTRY[None] = HTMLFormatter( + entity_substitution=None +) +XMLFormatter.REGISTRY["html"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +XMLFormatter.REGISTRY["minimal"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +XMLFormatter.REGISTRY[None] = Formatter( + Formatter.XML, entity_substitution=None +) diff --git a/bs4/formatter.py.bak b/bs4/formatter.py.bak new file mode 100644 index 0000000..f2724db --- /dev/null +++ b/bs4/formatter.py.bak @@ -0,0 +1,99 @@ +from bs4.dammit import EntitySubstitution + +class Formatter(EntitySubstitution): + """Describes a strategy to use when outputting a parse tree to a string. + + Some parts of this strategy come from the distinction between + HTML4, HTML5, and XML. Others are configurable by the user. + """ + # Registries of XML and HTML formatters.
+ XML_FORMATTERS = {} + HTML_FORMATTERS = {} + + HTML = 'html' + XML = 'xml' + + HTML_DEFAULTS = dict( + cdata_containing_tags=set(["script", "style"]), + ) + + def _default(self, language, value, kwarg): + if value is not None: + return value + if language == self.XML: + return set() + return self.HTML_DEFAULTS[kwarg] + + def __init__( + self, language=None, entity_substitution=None, + void_element_close_prefix='/', cdata_containing_tags=None, + ): + """ + + :param void_element_close_prefix: By default, represent void + elements as rather than + """ + self.language = language + self.entity_substitution = entity_substitution + self.void_element_close_prefix = void_element_close_prefix + self.cdata_containing_tags = self._default( + language, cdata_containing_tags, 'cdata_containing_tags' + ) + + def substitute(self, ns): + """Process a string that needs to undergo entity substitution.""" + if not self.entity_substitution: + return ns + from element import NavigableString + if (isinstance(ns, NavigableString) + and ns.parent is not None + and ns.parent.name in self.cdata_containing_tags): + # Do nothing. + return ns + # Substitute. + return self.entity_substitution(ns) + + def attribute_value(self, value): + """Process the value of an attribute.""" + return self.substitute(value) + + def attributes(self, tag): + """Reorder a tag's attributes however you want.""" + return sorted(tag.attrs.items()) + + +class HTMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs) + + +class XMLFormatter(Formatter): + REGISTRY = {} + def __init__(self, *args, **kwargs): + return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs) + + +# Set up aliases for the default formatters. 
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +HTMLFormatter.REGISTRY["html5"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_html, + void_element_close_prefix = None +) +HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +HTMLFormatter.REGISTRY[None] = HTMLFormatter( + entity_substitution=None +) +XMLFormatter.REGISTRY["html"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_html +) +XMLFormatter.REGISTRY["minimal"] = XMLFormatter( + entity_substitution=EntitySubstitution.substitute_xml +) +XMLFormatter.REGISTRY[None] = Formatter( + Formatter(Formatter.XML, entity_substitution=None) +) diff --git a/bs4/testing.py b/bs4/testing.py new file mode 100644 index 0000000..cc99666 --- /dev/null +++ b/bs4/testing.py @@ -0,0 +1,992 @@ +# encoding: utf-8 +"""Helper classes for tests.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import pickle +import copy +import functools +import unittest +from unittest import TestCase +from bs4 import BeautifulSoup +from bs4.element import ( + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + SoupStrainer, + Tag +) + +from bs4.builder import HTMLParserTreeBuilder +default_builder = HTMLParserTreeBuilder + +BAD_DOCUMENT = """A bare string + + +
+
HTML5 does allow CDATA sections in SVG
+
A tag
+
A
tag that supposedly has contents.
+
AT&T
+
+
+
This numeric entity is missing the final semicolon:
+ +
a
+
This document contains (do you see it?)
+
This document ends with That attribute value was bogus
+The doctype is invalid because it contains extra whitespace +
That boolean attribute had no value
+
Here's a nonexistent entity: &#foo; (do you see it?)
+
This document ends before the entity finishes: > +

Paragraphs shouldn't contain block display elements, but this one does:

you see?

+Multiple values for the same attribute. +
Here's a table
+
+
This tag contains nothing but whitespace:
+

This p tag is cut off by

the end of the blockquote tag
+
Here's a nested table:
foo
This table contains bare markup
+ +
This document contains a surprise doctype
+ +
Tag name contains Unicode characters
+ + +""" + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup, **kwargs): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e + + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. 
+ if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT 
{}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + +class HTMLTreeBuilderSmokeTest(object): + + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. 
+ """ + for name in [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + 'spacer', 'frame' + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + self.assertEqual(True, new_tag.is_empty_element) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment): + """Generate and parse a document with the given doctype.""" + doctype = '' % doctype_fragment + markup = doctype + '\n

foo

' + soup = self.soup(markup) + return doctype, soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""content""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. 
+ markup = """""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + + markup = b"""""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A

tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("

") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "

") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("

", "

") + self.assertSoupEquals("", "") + + self.assertSoupEquals("
", "
") + + def test_br_is_always_empty_element_tag(self): + """A
tag is designated as an empty-element tag. + + Some parsers treat

as one
tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("

") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "
") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("") + + def test_double_head(self): + html = ''' + + +Ordinary HEAD element test + + + +Hello, world! + + +''' + soup = self.soup(html) + self.assertEqual("text/javascript", soup.find('script')['type']) + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "

foobaz

" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEqual(comment.__class__, Comment) + + # The comment is properly integrated into the tree. + foo = soup.find(text="foo") + self.assertEqual(comment, foo.next_element) + baz = soup.find(text="baz") + self.assertEqual(comment, baz.previous_element) + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in
 and "
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("")
+        self.assertEqual(soup.textarea.prettify(), "")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "Inside a B tag"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "

A nested tag

" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "

A doubly nested tag

" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup('

Foo

') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '
' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + + def test_deeply_nested_multivalued_attribute(self): + # html5lib can set the attributes of the same tag many times + # as it rearranges the tree. This has caused problems with + # multivalued attributes. + markup = '
' + soup = self.soup(markup) + self.assertEqual(["css"], soup.div.div['class']) + + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes ot the + # tag. This has caused problems with multivalued + # attributes. + markup = '' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('', '') + + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + "

• AT&T is in the s&p 500

", + "

\u2022 AT&T is in the s&p 500

" + ) + + def test_apos_entity(self): + self.assertSoupEquals( + "

Bob's Bar

", + "

Bob's Bar

", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. + markup = "

“Hello” -☃

" + soup = self.soup(markup) + self.assertEqual("“Hello” -☃", soup.p.string) + + def test_entities_in_attributes_converted_to_unicode(self): + expect = '

' + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + + def test_quot_entity_converted_to_quotation_mark(self): + self.assertSoupEquals("

I said "good day!"

", + '

I said "good day!"

') + + def test_out_of_range_entity(self): + expect = "\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + + def test_multipart_strings(self): + "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." + soup = self.soup("

\nfoo

") + self.assertEqual("p", soup.h2.string.next_element.name) + self.assertEqual("p", soup.p.name) + self.assertConnectedness(soup) + + def test_empty_element_tags(self): + """Verify consistent handling of empty-element tags, + no matter how they come in through the markup. + """ + self.assertSoupEquals('


', "


") + self.assertSoupEquals('


', "


") + + def test_head_tag_between_head_and_body(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + foo + +""" + soup = self.soup(content) + self.assertNotEqual(None, soup.html.body) + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + + + + +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'4' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. + # + + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 
+ soup = self.soup(markup) + self.assertEqual('Sacr\xe9 bleu!', soup.body.string) + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A bold statement", + parse_only=strainer) + self.assertEqual(soup.decode(), "bold") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("", + '') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """a""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """a""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """a""") + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assertSoupEquals('', + '') + + self.assertSoupEquals( + 'foo', + 'foo') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<<sacré bleu!>>

" + expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" + self.assertSoupEquals(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = b"

\x91Foo\x92

" + soup = self.soup(quote) + self.assertEqual( + soup.p.string, + "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) + + def test_entities_converted_on_the_way_out(self): + text = "

<<sacré bleu!>>

" + expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + + def test_real_iso_latin_document(self): + # Smoke test of interrelated functionality, using an + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. + unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + + # That's because we're going to encode it into ISO-Latin-1, and use + # that to test. + iso_latin_html = unicode_html.encode("iso-8859-1") + + # Parse the ISO-Latin-1 HTML. + soup = self.soup(iso_latin_html) + # Encode it to UTF-8. + result = soup.encode("utf-8") + + # What do we expect the result to look like? Well, it would + # look like unicode_html, except that the META tag would say + # UTF-8 instead of ISO-Latin-1. + expected = unicode_html.replace("ISO-Latin-1", "utf-8") + + # And, of course, it would be in UTF-8, not Unicode. + expected = expected.encode("utf-8") + + # Ta-da! + self.assertEqual(result, expected) + + def test_real_shift_jis_document(self): + # Smoke test to make sure the parser can handle a document in + # Shift-JIS encoding, without choking. + shift_jis_html = ( + b'
'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'
') + unicode_html = shift_jis_html.decode("shift-jis") + soup = self.soup(unicode_html) + + # Make sure the parse tree is correctly encoded to various + # encodings. + self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) + self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-9 (a + # Hebrew encoding) to UTF-8. + hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' + soup = self.soup( + hebrew_document, from_encoding="iso8859-8") + # Some tree builders call it iso8859-8, others call it iso-8859-9. + # That's not a difference we really care about. + assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') + self.assertEqual( + soup.encode('utf-8'), + hebrew_document.decode("iso8859-8").encode("utf-8")) + + def test_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) + content = parsed_meta['content'] + self.assertEqual('text/html; charset=x-sjis', content) + + # But that value is actually a ContentMetaAttributeValue object. + self.assertTrue(isinstance(content, ContentMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('text/html; charset=utf8', content.encode("utf8")) + + # For the rest of the story, see TestSubstitutions in + # test_tree.py. + + def test_html5_style_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', id="encoding") + charset = parsed_meta['charset'] + self.assertEqual('x-sjis', charset) + + # But that value is actually a CharsetMetaAttributeValue object. + self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. 
+ self.assertEqual('utf8', charset.encode("utf8")) + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("text") + data.a['foo'] = 'bar' + self.assertEqual('text', data.a.decode()) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class XMLTreeBuilderSmokeTest(object): + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def test_docstring_generated(self): + soup = self.soup("") + self.assertEqual( + soup.encode(), b'\n') + + def test_xml_declaration(self): + markup = b"""\n""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_processing_instruction(self): + markup = b"""\n""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + def test_nested_namespaces(self): + doc = b""" + + + + + +""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + + def test_formatter_processes_script_tag_for_xml_documents(self): + doc = """ + +""" + soup = BeautifulSoup(doc, "lxml-xml") + # lxml would have stripped this while parsing, but we can add + # it later. + soup.script.string = 'console.log("< < hey > > ");' + encoded = soup.encode() + self.assertTrue(b"< < hey > >" in encoded) + + def test_can_parse_unicode_document(self): + markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 
+ soup = self.soup(markup) + self.assertEqual('Sacr\xe9 bleu!', soup.root.string) + + def test_popping_namespaced_tag(self): + markup = 'b2012-07-02T20:33:42Zcd' + soup = self.soup(markup) + self.assertEqual( + str(soup.rss), markup) + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("") + self.assertEqual( + soup.encode("latin1"), + b'\n') + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'\n' + + b'0' * (2**12) + + b'') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("

", "

") + self.assertSoupEquals("

foo

") + + def test_namespaces_are_preserved(self): + markup = 'This tag is in the a namespaceThis tag is in the b namespace' + soup = self.soup(markup) + root = soup.root + self.assertEqual("http://example.com/", root['xmlns:a']) + self.assertEqual("http://example.net/", root['xmlns:b']) + + def test_closing_namespaced_tag(self): + markup = '

20010504

' + soup = self.soup(markup) + self.assertEqual(str(soup.p), markup) + + def test_namespaced_attributes(self): + markup = '' + soup = self.soup(markup) + self.assertEqual(str(soup.foo), markup) + + def test_namespaced_attributes_xml_namespace(self): + markup = 'bar' + soup = self.soup(markup) + self.assertEqual(str(soup.foo), markup) + + def test_find_by_prefixed_name(self): + doc = """ +foo + bar + baz + +""" + soup = self.soup(doc) + + # There are three tags. + self.assertEqual(3, len(soup.find_all('tag'))) + + # But two of them are ns1:tag and one of them is ns2:tag. + self.assertEqual(2, len(soup.find_all('ns1:tag'))) + self.assertEqual(1, len(soup.find_all('ns2:tag'))) + + self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) + self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) + + def test_copy_tag_preserves_namespace(self): + xml = """ +""" + + soup = self.soup(xml) + tag = soup.document + duplicate = copy.copy(tag) + + # The two tags have the same namespace prefix. + self.assertEqual(tag.prefix, duplicate.prefix) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. 
+ pass + + def test_html_tags_have_namespace(self): + markup = "" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '5' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + def test_xml_declaration_becomes_comment(self): + markup = '' + soup = self.soup(markup) + self.assertTrue(isinstance(soup.contents[0], Comment)) + self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') + self.assertEqual("html", soup.contents[0].next_element.name) + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/bs4/testing.py.bak b/bs4/testing.py.bak new file mode 100644 index 0000000..9f12e8d --- /dev/null +++ b/bs4/testing.py.bak @@ -0,0 +1,992 @@ +# encoding: utf-8 +"""Helper classes for tests.""" + +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + +import pickle +import copy +import functools +import unittest +from unittest import TestCase +from bs4 import BeautifulSoup +from bs4.element import ( + CharsetMetaAttributeValue, + Comment, + ContentMetaAttributeValue, + Doctype, + SoupStrainer, + Tag +) + +from bs4.builder import HTMLParserTreeBuilder +default_builder = HTMLParserTreeBuilder + +BAD_DOCUMENT = u"""A bare string + + +
+
HTML5 does allow CDATA sections in SVG
+
A tag
+
A
tag that supposedly has contents.
+
AT&T
+
+
+
This numeric entity is missing the final semicolon:
+
+
a
+
This document contains (do you see it?)
+
This document ends with That attribute value was bogus
+The doctype is invalid because it contains extra whitespace +
That boolean attribute had no value
+
Here's a nonexistent entity: &#foo; (do you see it?)
+
This document ends before the entity finishes: > +

Paragraphs shouldn't contain block display elements, but this one does:

you see?

+Multiple values for the same attribute. +
Here's a table
+
+
This tag contains nothing but whitespace:
+

This p tag is cut off by

the end of the blockquote tag
+
Here's a nested table:
foo
This table contains bare markup
+ +
This document contains a surprise doctype
+ +
Tag name contains Unicode characters
+ + +""" + + +class SoupTest(unittest.TestCase): + + @property + def default_builder(self): + return default_builder + + def soup(self, markup, **kwargs): + """Build a Beautiful Soup object from markup.""" + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) + + def document_for(self, markup, **kwargs): + """Turn an HTML fragment into a document. + + The details depend on the builder. + """ + return self.default_builder(**kwargs).test_fragment_to_document(markup) + + def assertSoupEquals(self, to_parse, compare_parsed_to=None): + builder = self.default_builder + obj = BeautifulSoup(to_parse, builder=builder) + if compare_parsed_to is None: + compare_parsed_to = to_parse + + self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) + + def assertConnectedness(self, element): + """Ensure that next_element and previous_element are properly + set for all descendants of the given element. + """ + earlier = None + for e in element.descendants: + if earlier: + self.assertEqual(e, earlier.next_element) + self.assertEqual(earlier, e.previous_element) + earlier = e + + def linkage_validator(self, el, _recursive_call=False): + """Ensure proper linkage throughout the document.""" + descendant = None + # Document element should have no previous element or previous sibling. + # It also shouldn't have a next sibling. 
+ if el.parent is None: + assert el.previous_element is None,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_element, None + ) + assert el.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + el, el.previous_sibling, None + ) + assert el.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_sibling, None + ) + + idx = 0 + child = None + last_child = None + last_idx = len(el.contents) - 1 + for child in el.contents: + descendant = None + + # Parent should link next element to their first child + # That child should have no previous sibling + if idx == 0: + if el.parent is not None: + assert el.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( + el, el.next_element, child + ) + assert child.previous_element is el,\ + "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( + child, child.previous_element, el + ) + assert child.previous_sibling is None,\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( + child, child.previous_sibling, None + ) + + # If not the first child, previous index should link as sibling to this index + # Previous element should match the last index or the last bubbled up descendant + else: + assert child.previous_sibling is el.contents[idx - 1],\ + "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( + child, child.previous_sibling, el.contents[idx - 1] + ) + assert el.contents[idx - 1].next_sibling is child,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + el.contents[idx - 1], el.contents[idx - 1].next_sibling, child + ) + + if last_child is not None: + assert child.previous_element is last_child,\ + "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( + child, child.previous_element, last_child, child.parent.contents + ) + assert last_child.next_element is child,\ + "Bad next_element\nNODE: {}\nNEXT 
{}\nEXPECTED {}".format( + last_child, last_child.next_element, child + ) + + if isinstance(child, Tag) and child.contents: + descendant = self.linkage_validator(child, True) + # A bubbled up descendant should have no next siblings + assert descendant.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + descendant, descendant.next_sibling, None + ) + + # Mark last child as either the bubbled up descendant or the current child + if descendant is not None: + last_child = descendant + else: + last_child = child + + # If last child, there are non next siblings + if idx == last_idx: + assert child.next_sibling is None,\ + "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_sibling, None + ) + idx += 1 + + child = descendant if descendant is not None else child + if child is None: + child = el + + if not _recursive_call and child is not None: + target = el + while True: + if target is None: + assert child.next_element is None, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, None + ) + break + elif target.next_sibling is not None: + assert child.next_element is target.next_sibling, \ + "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( + child, child.next_element, target.next_sibling + ) + break + target = target.parent + + # We are done, so nothing to return + return None + else: + # Return the child to the recursive caller + return child + + +class HTMLTreeBuilderSmokeTest(object): + + """A basic test of a treebuilder's competence. + + Any HTML treebuilder, present or future, should be able to pass + these tests. With invalid markup, there's room for interpretation, + and different parsers can handle it differently. But with the + markup in these tests, there's not much room for interpretation. + """ + + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. 
+ """ + for name in [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + 'spacer', 'frame' + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + self.assertEqual(True, new_tag.is_empty_element) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def assertDoctypeHandled(self, doctype_fragment): + """Assert that a given doctype string is handled correctly.""" + doctype_str, soup = self._document_with_doctype(doctype_fragment) + + # Make sure a Doctype object was created. + doctype = soup.contents[0] + self.assertEqual(doctype.__class__, Doctype) + self.assertEqual(doctype, doctype_fragment) + self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) + + # Make sure that the doctype was correctly associated with the + # parse tree and that the rest of the document parsed. + self.assertEqual(soup.p.contents[0], 'foo') + + def _document_with_doctype(self, doctype_fragment): + """Generate and parse a document with the given doctype.""" + doctype = '' % doctype_fragment + markup = doctype + '\n

foo

' + soup = self.soup(markup) + return doctype, soup + + def test_normal_doctypes(self): + """Make sure normal, everyday HTML doctypes are handled correctly.""" + self.assertDoctypeHandled("html") + self.assertDoctypeHandled( + 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') + + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_public_doctype_with_url(self): + doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' + self.assertDoctypeHandled(doctype) + + def test_system_doctype(self): + self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') + + def test_namespaced_system_doctype(self): + # We can handle a namespaced doctype with a system ID. + self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') + + def test_namespaced_public_doctype(self): + # Test a namespaced doctype with a public id. + self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') + + def test_real_xhtml_document(self): + """A real XHTML document should come out more or less the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8").replace(b"\n", b""), + markup.replace(b"\n", b"")) + + def test_namespaced_html(self): + """When a namespaced XML document is parsed as HTML it should + be treated as HTML with weird tag names. + """ + markup = b"""content""" + soup = self.soup(markup) + self.assertEqual(2, len(soup.find_all("ns1:foo"))) + + def test_processing_instruction(self): + # We test both Unicode and bytestring to verify that + # process_markup correctly sets processing_instruction_class + # even when the markup is already Unicode and there is no + # need to process anything. 
+ markup = u"""""" + soup = self.soup(markup) + self.assertEqual(markup, soup.decode()) + + markup = b"""""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_deepcopy(self): + """Make sure you can copy the tree builder. + + This is important because the builder is part of a + BeautifulSoup object, and we want to be able to copy that. + """ + copy.deepcopy(self.default_builder) + + def test_p_tag_is_never_empty_element(self): + """A

tag is never designated as an empty-element tag. + + Even if the markup shows it as an empty-element tag, it + shouldn't be presented that way. + """ + soup = self.soup("

") + self.assertFalse(soup.p.is_empty_element) + self.assertEqual(str(soup.p), "

") + + def test_unclosed_tags_get_closed(self): + """A tag that's not closed by the end of the document should be closed. + + This applies to all tags except empty-element tags. + """ + self.assertSoupEquals("

", "

") + self.assertSoupEquals("", "") + + self.assertSoupEquals("
", "
") + + def test_br_is_always_empty_element_tag(self): + """A
tag is designated as an empty-element tag. + + Some parsers treat

as one
tag, some parsers as + two tags, but it should always be an empty-element tag. + """ + soup = self.soup("

") + self.assertTrue(soup.br.is_empty_element) + self.assertEqual(str(soup.br), "
") + + def test_nested_formatting_elements(self): + self.assertSoupEquals("") + + def test_double_head(self): + html = ''' + + +Ordinary HEAD element test + + + +Hello, world! + + +''' + soup = self.soup(html) + self.assertEqual("text/javascript", soup.find('script')['type']) + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "

foobaz

" + self.assertSoupEquals(markup) + + soup = self.soup(markup) + comment = soup.find(text="foobar") + self.assertEqual(comment.__class__, Comment) + + # The comment is properly integrated into the tree. + foo = soup.find(text="foo") + self.assertEqual(comment, foo.next_element) + baz = soup.find(text="baz") + self.assertEqual(comment, baz.previous_element) + + def test_preserved_whitespace_in_pre_and_textarea(self): + """Whitespace must be preserved in
 and "
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("")
+        self.assertEqual(soup.textarea.prettify(), "")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "Inside a B tag"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "

A nested tag

" + self.assertSoupEquals(nested_b_tag) + + double_nested_b_tag = "

A doubly nested tag

" + self.assertSoupEquals(nested_b_tag) + + def test_nested_block_level_elements(self): + """Block elements can be nested.""" + soup = self.soup('

Foo

') + blockquote = soup.blockquote + self.assertEqual(blockquote.p.b.string, 'Foo') + self.assertEqual(blockquote.b.string, 'Foo') + + def test_correctly_nested_tables(self): + """One table can go inside another one.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_multivalued_attribute_with_whitespace(self): + # Whitespace separating the values of a multi-valued attribute + # should be ignored. + + markup = '
' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.div['class']) + + # If you search by the literal name of the class it's like the whitespace + # wasn't there. + self.assertEqual(soup.div, soup.find('div', class_="foo bar")) + + def test_deeply_nested_multivalued_attribute(self): + # html5lib can set the attributes of the same tag many times + # as it rearranges the tree. This has caused problems with + # multivalued attributes. + markup = '
' + soup = self.soup(markup) + self.assertEqual(["css"], soup.div.div['class']) + + def test_multivalued_attribute_on_html(self): + # html5lib uses a different API to set the attributes ot the + # tag. This has caused problems with multivalued + # attributes. + markup = '' + soup = self.soup(markup) + self.assertEqual(["a", "b"], soup.html['class']) + + def test_angle_brackets_in_attribute_values_are_escaped(self): + self.assertSoupEquals('', '') + + def test_strings_resembling_character_entity_references(self): + # "&T" and "&p" look like incomplete character entities, but they are + # not. + self.assertSoupEquals( + u"

• AT&T is in the s&p 500

", + u"

\u2022 AT&T is in the s&p 500

" + ) + + def test_apos_entity(self): + self.assertSoupEquals( + u"

Bob's Bar

", + u"

Bob's Bar

", + ) + + def test_entities_in_foreign_document_encoding(self): + # “ and ” are invalid numeric entities referencing + # Windows-1252 characters. - references a character common + # to Windows-1252 and Unicode, and ☃ references a + # character only found in Unicode. + # + # All of these entities should be converted to Unicode + # characters. + markup = "

“Hello” -☃

" + soup = self.soup(markup) + self.assertEquals(u"“Hello” -☃", soup.p.string) + + def test_entities_in_attributes_converted_to_unicode(self): + expect = u'

' + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + self.assertSoupEquals('

', expect) + + def test_entities_in_text_converted_to_unicode(self): + expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + self.assertSoupEquals("

piñata

", expect) + + def test_quot_entity_converted_to_quotation_mark(self): + self.assertSoupEquals("

I said "good day!"

", + '

I said "good day!"

') + + def test_out_of_range_entity(self): + expect = u"\N{REPLACEMENT CHARACTER}" + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + self.assertSoupEquals("�", expect) + + def test_multipart_strings(self): + "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." + soup = self.soup("

\nfoo

") + self.assertEqual("p", soup.h2.string.next_element.name) + self.assertEqual("p", soup.p.name) + self.assertConnectedness(soup) + + def test_empty_element_tags(self): + """Verify consistent handling of empty-element tags, + no matter how they come in through the markup. + """ + self.assertSoupEquals('


', "


") + self.assertSoupEquals('


', "


") + + def test_head_tag_between_head_and_body(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + foo + +""" + soup = self.soup(content) + self.assertNotEqual(None, soup.html.body) + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + + + + +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'4' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. + # + + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 
+ soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A bold statement", + parse_only=strainer) + self.assertEqual(soup.decode(), "bold") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("", + '') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """a""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """a""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """a""") + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assertSoupEquals('', + '') + + self.assertSoupEquals( + 'foo', + 'foo') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" + self.assertSoupEquals(text, expected) + + def test_smart_quotes_converted_on_the_way_in(self): + # Microsoft smart quotes are converted to Unicode characters during + # parsing. + quote = b"

\x91Foo\x92

" + soup = self.soup(quote) + self.assertEqual( + soup.p.string, + u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + + def test_non_breaking_spaces_converted_on_the_way_in(self): + soup = self.soup("  ") + self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + + def test_entities_converted_on_the_way_out(self): + text = "

<<sacré bleu!>>

" + expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") + soup = self.soup(text) + self.assertEqual(soup.p.encode("utf-8"), expected) + + def test_real_iso_latin_document(self): + # Smoke test of interrelated functionality, using an + # easy-to-understand document. + + # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. + unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + + # That's because we're going to encode it into ISO-Latin-1, and use + # that to test. + iso_latin_html = unicode_html.encode("iso-8859-1") + + # Parse the ISO-Latin-1 HTML. + soup = self.soup(iso_latin_html) + # Encode it to UTF-8. + result = soup.encode("utf-8") + + # What do we expect the result to look like? Well, it would + # look like unicode_html, except that the META tag would say + # UTF-8 instead of ISO-Latin-1. + expected = unicode_html.replace("ISO-Latin-1", "utf-8") + + # And, of course, it would be in UTF-8, not Unicode. + expected = expected.encode("utf-8") + + # Ta-da! + self.assertEqual(result, expected) + + def test_real_shift_jis_document(self): + # Smoke test to make sure the parser can handle a document in + # Shift-JIS encoding, without choking. + shift_jis_html = ( + b'
'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'
') + unicode_html = shift_jis_html.decode("shift-jis") + soup = self.soup(unicode_html) + + # Make sure the parse tree is correctly encoded to various + # encodings. + self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) + self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) + + def test_real_hebrew_document(self): + # A real-world test to make sure we can convert ISO-8859-9 (a + # Hebrew encoding) to UTF-8. + hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' + soup = self.soup( + hebrew_document, from_encoding="iso8859-8") + # Some tree builders call it iso8859-8, others call it iso-8859-9. + # That's not a difference we really care about. + assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') + self.assertEqual( + soup.encode('utf-8'), + hebrew_document.decode("iso8859-8").encode("utf-8")) + + def test_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) + content = parsed_meta['content'] + self.assertEqual('text/html; charset=x-sjis', content) + + # But that value is actually a ContentMetaAttributeValue object. + self.assertTrue(isinstance(content, ContentMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. + self.assertEqual('text/html; charset=utf8', content.encode("utf8")) + + # For the rest of the story, see TestSubstitutions in + # test_tree.py. + + def test_html5_style_meta_tag_reflects_current_encoding(self): + # Here's the tag saying that a document is + # encoded in Shift-JIS. + meta_tag = ('') + + # Here's a document incorporating that meta tag. + shift_jis_html = ( + '\n%s\n' + '' + 'Shift-JIS markup goes here.') % meta_tag + soup = self.soup(shift_jis_html) + + # Parse the document, and the charset is seemingly unaffected. + parsed_meta = soup.find('meta', id="encoding") + charset = parsed_meta['charset'] + self.assertEqual('x-sjis', charset) + + # But that value is actually a CharsetMetaAttributeValue object. + self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) + + # And it will take on a value that reflects its current + # encoding. 
+ self.assertEqual('utf8', charset.encode("utf8")) + + def test_tag_with_no_attributes_can_have_attributes_added(self): + data = self.soup("text") + data.a['foo'] = 'bar' + self.assertEqual('text', data.a.decode()) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class XMLTreeBuilderSmokeTest(object): + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), tree.decode()) + + def test_docstring_generated(self): + soup = self.soup("") + self.assertEqual( + soup.encode(), b'\n') + + def test_xml_declaration(self): + markup = b"""\n""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_processing_instruction(self): + markup = b"""\n""" + soup = self.soup(markup) + self.assertEqual(markup, soup.encode("utf8")) + + def test_real_xhtml_document(self): + """A real XHTML document should come out *exactly* the same as it went in.""" + markup = b""" + + +Hello. +Goodbye. +""" + soup = self.soup(markup) + self.assertEqual( + soup.encode("utf-8"), markup) + + def test_nested_namespaces(self): + doc = b""" + + + + + +""" + soup = self.soup(doc) + self.assertEqual(doc, soup.encode()) + + def test_formatter_processes_script_tag_for_xml_documents(self): + doc = """ + +""" + soup = BeautifulSoup(doc, "lxml-xml") + # lxml would have stripped this while parsing, but we can add + # it later. + soup.script.string = 'console.log("< < hey > > ");' + encoded = soup.encode() + self.assertTrue(b"< < hey > >" in encoded) + + def test_can_parse_unicode_document(self): + markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' 
+ soup = self.soup(markup) + self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + + def test_popping_namespaced_tag(self): + markup = 'b2012-07-02T20:33:42Zcd' + soup = self.soup(markup) + self.assertEqual( + unicode(soup.rss), markup) + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("") + self.assertEqual( + soup.encode("latin1"), + b'\n') + + def test_large_xml_document(self): + """A large XML document should come out the same as it went in.""" + markup = (b'\n' + + b'0' * (2**12) + + b'') + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("

", "

") + self.assertSoupEquals("

foo

") + + def test_namespaces_are_preserved(self): + markup = 'This tag is in the a namespaceThis tag is in the b namespace' + soup = self.soup(markup) + root = soup.root + self.assertEqual("http://example.com/", root['xmlns:a']) + self.assertEqual("http://example.net/", root['xmlns:b']) + + def test_closing_namespaced_tag(self): + markup = '

20010504

' + soup = self.soup(markup) + self.assertEqual(unicode(soup.p), markup) + + def test_namespaced_attributes(self): + markup = '' + soup = self.soup(markup) + self.assertEqual(unicode(soup.foo), markup) + + def test_namespaced_attributes_xml_namespace(self): + markup = 'bar' + soup = self.soup(markup) + self.assertEqual(unicode(soup.foo), markup) + + def test_find_by_prefixed_name(self): + doc = """ +foo + bar + baz + +""" + soup = self.soup(doc) + + # There are three tags. + self.assertEqual(3, len(soup.find_all('tag'))) + + # But two of them are ns1:tag and one of them is ns2:tag. + self.assertEqual(2, len(soup.find_all('ns1:tag'))) + self.assertEqual(1, len(soup.find_all('ns2:tag'))) + + self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) + self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) + + def test_copy_tag_preserves_namespace(self): + xml = """ +""" + + soup = self.soup(xml) + tag = soup.document + duplicate = copy.copy(tag) + + # The two tags have the same namespace prefix. + self.assertEqual(tag.prefix, duplicate.prefix) + + def test_worst_case(self): + """Test the worst case (currently) for linking issues.""" + + soup = self.soup(BAD_DOCUMENT) + self.linkage_validator(soup) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_real_xhtml_document(self): + # Since XHTML is not HTML5, HTML5 parsers are not tested to handle + # XHTML documents in any particular way. 
+ pass + + def test_html_tags_have_namespace(self): + markup = "" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '5' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + + def test_xml_declaration_becomes_comment(self): + markup = '' + soup = self.soup(markup) + self.assertTrue(isinstance(soup.contents[0], Comment)) + self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') + self.assertEqual("html", soup.contents[0].next_element.name) + +def skipIf(condition, reason): + def nothing(test, *args, **kwargs): + return None + + def decorator(test_item): + if condition: + return nothing + else: + return test_item + + return decorator diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py new file mode 100644 index 0000000..142c8cc --- /dev/null +++ b/bs4/tests/__init__.py @@ -0,0 +1 @@ +"The beautifulsoup tests." 
diff --git a/bs4/tests/__pycache__/__init__.cpython-36.pyc b/bs4/tests/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a2e5cec890fc86416bfd98602ada5568c435048a GIT binary patch literal 227 zcmXr!<>hMl^*UCXfq~&M5W@i@kmUfx#T-B)g&~R|g)x{xlc`D~BqLQJDK)XQBr~lv zr#Qc~K%peHxTILmPm}Q$dwhIKesX;LN`@jPpf)h^D@Q*gKQ~oBGcix!B|o_|H#M)M zSl=bJEHx*;0L0M`Ey>I&)^|vG%16~+P}3GpUbk{dVEX4E82OE(f7U)m;hQ`e5{y5n$MTS**DItYkeN{~Q+ zUVv7_k*A$u{$|?gTl5|J41ECn?XOPXAiwgR#YH3|CU&X8*~Qt#?w;>lSJzst`d`2P z-=F_|QPcjT&HOgezlA6M7YNsEjWezX%x0?9Z5^!<7(>%G8I74iWmvUqL(8^?EA|T0 zPBy`@qI zlwYi7U)Kat!S{79Eh#{*blgn zTT<-%aUxH4{tF4UUWcWv+rk|u{$LzDj>e;ok*?))onmf!@eg|+K6vQLSjaso#O-XY zbYte_QNAt~T%*}HX2$mJcJ%b(gC`$8m|@SmCJV=ItT^|WK^yAonCQ{>zVUSj#THi}j zi-vCme!sJlHVI+p;iKKhyS=@%;l|Knvio%G2B}qQxEe&!(Rd^+NFi%PNU&U$5ANSC zzL!<7p<^=^!Qh{ogac_`pL7qRq3HT<*!>{##zPS%arXo9Oa#%0Ub-J8eh_ze``Baq z;kY08@qv)rd}hYNSa)=C?Wo(2uXhtNVnv-rv)mb-q-!+pc+s%$hi>9W;T8@Q{{Tc| zCTp{*t|;}?`{d1YOk!tokvR<;SW9X65FKdpa;Y6QWto0Ujt-8}Ydh0MrkxXePBZNr$s**F) zw9`LvhOUIpo#bRBb}$~dK?t^WwyCe9-P9*LOW`_qnK|G}*LZ?u0o>bXf}Xj?49_Us zBB{#TCEqN7DmfK!&S0wcY%_rEE1WXKNRCN^TwrIUg4PoFl$|RKUkJW+1z+e61$W?D z(v>IgVyBp(&aExCvnK4W$tG_t1#fPV5;O{TEj-HF{)kS3SVD2+EB3e7pChVp#13Bh ziXuwZukP#F`zMGGf6^XlPbj7siK)hG7^6Hzk;Cfk!ELKjc#-6(rho#&8VENTOA*UD z_M5DR(RMuc4z_c4r;=L7t_=NfKTg}8n|KFK@h<7OX)}u_k6b?%aa#Y_4aQ4cyNf&HTIH7Iy@4wlbP3DY!TDRG`(GE>aUs>o4mgi)fZZ_ z*RfK|aYzG>lhz$)81ZpH{ifqkRLnH;~;d-sCN`E4I}Lb}0xlA{wVqoi1rXj2%OgItVVSPy!xfUBa? 
zlDJceMe@<;Q_PBKC5=tii;d|}J8HP}0s}K>#d~<-w?PtBg&nd}#LQP<`2*Bor_5(l zAY?`=2vei) z%I}m{zJqCtvR*moyoDB#5|7H@;~)}xLhula!e>e(+s^dxWq*;# zW@0?)3<%FUG+pJV`i#ZSMmUB062vplmcPQ5lG4c{a_`L|H)DgI8-|hc>nQYmRDM!C z9it3ERbK?|zY}g*zH3srySEgnms_{jX~{R}s1)wyPl>!m# z9+BS=xk==_+Vf`^_!y5;wyHf1tA^^cVwrdh>w=^ioei~}ZH+@eA#^0wtG0oAwkqcB z26{V1i9ou{?wX|0tjHrwBwd!$#{D5#oGSIRe;BBX6x~u&YY%JRaq)i!s)^Os=bbvW lZf3Rj`xMOLEf5sV%tG?siQ5yh_^#Q;q)CxfxzMn0aRIPQCrA*4!lgL_Btd!Pd7dInHV-<@u z;-r+m^}k@!ZWJ~lx{Qzls@;}hmym?1XvHHbwW!^QbDdNp#ol67*nd3>Q%LOOX{-+I z3dbLhgIO4a_;fOz-}T1<_Q}I-u<*lKf9T`J+@DS73o`M7(FnWVbf}#L8IAil;U~yy zdYBUS30jMUp*TvBCsujFQYDecUKtnL!vo!`m`mg@h1hgv`I5(S&9V+Xs`%%j?XI^$ oOL>9H#Y;S_EW6uX;anzy=A2y_dIH(h03kH1mSgB^Ik0a00-3Rd(f|Me literal 0 HcmV?d00001 diff --git a/bs4/tests/__pycache__/test_htmlparser.cpython-36.pyc b/bs4/tests/__pycache__/test_htmlparser.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33a17c1345afac68e0c1f840f6c80162f149fd77 GIT binary patch literal 2493 zcmbVO-EP}96c+Vk%d*o3!Pa!^&}t~K&BJP8=r9bJ7X)34_F^*%B;AF8L7+uCQKG+* z)Cvm;Fu;9?y~18ex4SC#2EE!jl;b!}+69xrqoYIe@cey;ez>vW{qg3XU;peF#^1); z;{d-4L(YIOBQzLcW=b+MGzs!nYK0cO?bObk(8)SsN6VeGo4KK@`A*u)ywD@YQ-gJx z`^sQ0cX(%EhkcOrmQ7?BNMu9Ji;F9LnH>x-E;kGR;=ud38=>h-#C7RA#EVcfc<3h z7rJjtp3Z~|Qcd2-;IPPekkCAMT*Otzb0vev{CA!fC4K}iRFcZz$+RerA3U$7X(A7~ zc)-?Hyfqh;^M}X5RDKgc20$6o2y@k@YE;gfyIPjhjLQ-d!6G@Aif0ikVl^-M4Ky61 zW|;L4?{|9ieog-w6CH#h5xuj3w4w3f^RksCx*wlexdm(fuay@yt*pdkAja;az@^RT-Fabsg83~im6 z&_?3{AfSP>u!AZ%JFjA)^R=yOVYxp_WYiwaZh<)xE8eg@G9VtQzrE7=6_VPbx~yoG zCq$P8L+9xeVA?!kc1X(uqLJ6gb0+nj@l@=L10@ky`Hrd`>L@_Es+`eWMLa9jJmM)wG)Hk-K&%fUMGxdXFkJTs1ENwtyG~55qxR}3 zZN3#H9T{s#M1643{#7$Ero$u^SPNLH=(yt9Zd zOkl}M+~AtF>Dut?xwf8(LAZL*on06b)t+4*%ZtNmVcy)@2ZWycyH`!)6sl#F2-JZ? 
zmn-T~#=M5cCzz3NG`{l^@eea##=xDi7D1MxPaW8pUuG3;4TXG>+B@U)YeLb?7^HEruHC zUYmM6{YzNZ*09|n^`}<{qQ$e;Jb{k#5$q8kBe{bFVX$_B&KDjd5ZFT)x};pves470 z2yXx=3#(*PfpHahVTFCT^7Ki7FJYk=+j%QKs}fy1+qiHk8Li!nIs&=^^ifyY)VSle WDtGW)iRaqpfVgB6AUrU)?)?i0WSV;b literal 0 HcmV?d00001 diff --git a/bs4/tests/__pycache__/test_tree.cpython-36.pyc b/bs4/tests/__pycache__/test_tree.cpython-36.pyc new file mode 100644 index 0000000000000000000000000000000000000000..50a6e1bf54d5c50fb7da6b265675bf56b1c818df GIT binary patch literal 91500 zcmdSC3w)f{bsva%VlV(f@F9wnC}{{jArXKGQg09@36d6V36x35vd5A~gSi7>$N>iW zz8O9yRGXMi?KF|&>p z(}y=r_y0fle)oIKHv>?Tn=Qz>-}`>wJ?GqW&OPVcb5CyDmjA}D{r+pW-j^8omji*n zH1^Nn=RTbr7*GSn0hLh6dSWeEOeVOWs;7!6*`|wW*=C9v*=CDbY}56@wOlbL_h#xt zYx!b+ZMZlr$JzSG+P31hwbA0}+Vmho;@-qSHGgfN%H74=uMZU;kfR}U^x%63R9+2#d_WCX@4uTW?#JL88}%KgW1d`KO}@!{$*+&_WiBkCxQkIM0*I6kH(a6BRHeGJEss>g8rm>eI+@o_bY z<4HN5#PO7x#__ZqPvQ7+bpppHJ%{7xre&<#J)8?6_42FF9XZUamUT^`<%H7H-z+^}<9JLP7rvECUx->5WiZ&W)&7aPu6 zx!J5bo$U+d8?}}4QoVY$=>Wu?5xyGxzyL0%7RxL6jvP-C*PX0;y|!^_xtO}ORxhUO zC@XA=_Xm!mdCPw6R0oO)@RuZbM)KnW;3+AU#x|`opr33f#|E&{O3FOWEr0X)xeuVY zGjL6~)5izyB+d`qP25S|C66NgrQ5&tV)=Tt;I@Qj%1t30gSzsHTUZ6<$+=3yajF$u zuik2cG79Ezakb_a@W15(DY6O{IrVrag)eq^w-jGnn_k6x zrfcQ(X%In+wBb&luimKE8yh^Bez{qzyVDnz8jb6dFSVBHH5Y_BsRDOIu1#;;K6!n5 z$vrhKfRhrGT&+%R-2PgA78NurjkS%r8Gbg~kR510>S27U?4p}CuL^YBso8p?Qm(sm z*zq1?cNo@pU3jBau1};oX#jJ%lO`%UBkpRWRaYfYab?wSr?Uqy>TCtEGj?mu-=}!&pGZYAHC2L9$F_Cn5mvbKrOM!wJ z=YE`f4nL0|CgbrzK*iI9vlmZKq?~=MoE*Y=kVW`4ov{!Bpte)EN!|$>*w^!&3lo`6 zu2fnF(=U}e`BG`Ep;~qBkCaLzLVd*{Z*v|FzgND1)aUF#F@c{;pyiUeAvsgQf&H^w z@@bs%xg zO~_Z6B*b~kX|z^W>$i*g`P~EvXW$x{S5hVKCays$2u}xnO{5pv zh1v4liP`D$-0afa%FFs5QK~Z);TQEl9dRYg#0T+0_$sSHAQE*WI-* z{g|`zr5|)>r*WpW4VOZvCla~^DMvn@gzg+c(XUB+54CN4j%p9+ z2?AG|-_H9dVnOi`s+!0MC^{)fHbJz&0}=r3><5(xLr{U9qs3%$?iKRj4Y$%Z2!pApVEKFo@h6 zYP=&49{A|0c7YYCjwY5S&{ow+SSNWZFrT zL(p+1anhMW(GM^W#{s599w-vvLwH>GA%f^=Ul93EEldnKkMQjU76(vt21_N?sFX_1 zQJy-+f+~qK&Efy|5 
zJ*b(Ar;ShJ3&XUbxxMC(D?%Hj7;`pFt{FL-!I`Au71<*-c~3M`*U~Br+km;M23JIa zc-J8-BUjWA=rG?t`r$^az;T*eEjv)_DI=R4hjREzo&r&)s_s;a@_lx-xmKUMh_1K< zT9a=0vtI=3;@M6LXFPVDa300&ofK|uC#N3$WA@db{o*2Y3BS7vEq0PlwKGVaxK^+B zBh0-%VM5?)t}b8pI;xftz(6ve+>_XoOegjxcPB=YqX9k!~ifQ)Mqv?bM${|uFtl}r$IP;W-!uB&Z8&}jQ94?&`X2!7UEYfBj8 zL!+rRt7~q-sWw~AItFY@x2@(f)jkG>pp}L)l?Esep3+dBgVG?!PYAns9But-3I!C_ zjJ&3kIN^|)brL7P$|2WR^Kx*ilX$}6@dC6>5y+zTZN^k~g>ffu*g!4rC<>r0u|Kgl zkxuMSr4wx{m%C9W&xNpM8Sk^GCyFhqjT){Id;oLkLOFd_QNcXgIA3NV^JIQFTkuxd9niCQ7w+KnAprM=CAHg&F6Y2%;PcXfWUa`E>jBE+B~r zXWxH!UKtP5kKz-2m;!!OeiMK!bi^iLNyT2^%PZ-gSHNH1ig`*60tdPFVRR1L)4{zt zWYJ8ZP%d1lx~)2_2u+JApK1OJbAV3Qs482 zm4eKyBXvhHdVd(B`X<=$z7>_)KAumEp{I`}M-uIWy-{PI?baEb8C2&y!{Qu^^DN%W z;sq9Q`oc+Ecnv?TFC?HZjO9jhnOxrCl?s0Lj}qz(elCaU{^--Q(E;q~&I5t(7ENFQ z+L*fmJEd!hf%RCnLLDmLO{i3?Fg}?Z; z;WWv)L&soM?v0gamtn^}J?>Q1_?&YFZ*7kR?-BxN=e!$>Dcne2PXxOmnash8jJP^! zs+#?H*M1-COSeTk6i_JH{mZDwvaH>S_EWt<7ZA+d^^icrs-yD~KQRv59L0Q=?&W<1uZj*T0M|i4mvwYj7I46gq>6F7B3Uh1=qVZeuqVQ!i_;YD( zX%1i%I&e36EqQl9xHg&@m1d_h2WwKsZEaK?F#ut|+I)FKk7(1>JHeP}s{nNl zl}fPEyDkO^{s@3$w7<@_>(S~c`gHsLF0f8Xts$UUp+}8{eG)%fTL3I#ekEw3(A_2P z1FbzAXbw0c>u?rXh`iIE9H2>&bNiYAfwrtr=zz8X4Cx$SU<3F;0}xg1$GShi=aZo( zSvK=3Y6>=q4T6s94>(&gzgY38eHfCi4h=+A<&3<6Z|9$ALl82 zpYt>dgWM>j0)tKiDH44N{sPhSe*{lzF#;t~>-Xsdc*^5j09>e|5cJrC=2>%?AYj80 zk_c#rDbxB85v2V_b`q6-bZ{Vy&Bi(une~#>fHnh5Km80p!OtMm8c4QJZq}%n=R@tX zgib@Shu^nD=1ISWJ@6LRUQJ`5oZV<#hq~seTmQ(mE+Phm1BUh2UUkH`un=e=wJWyr z-n(-ElY&NP8>;Qm;)lYxG<_O+syGIJhAq{MDV@OZb9yseMxGC~()RB0P%G&+MBhb7 z00osEfl43EYASUO;`VlkcLf-EsJCvFkv$OGX!b1k7a}b_zQq>H!|}~M6m2dV{JHoB zFG4+$LD{?C=6L3kCc~QJG4av-!mtU zX!3?Wo$4<~8ZE}#i_lgu5IGm<$6lZJq#ed*)L8gOertIX4V2meE{+d4ugypZqt9!w z7SoXB$tXkJwjxgjqAK8yp$Ia$>q!};%5WUiaT%98sg-II{W-?<*4bV&-ck-PLC{}e zox+I|?JJwnvJ4|3(_QUw3{uOX74`@?JJ0kF;~%tG&IMsgThkIEH^60MVL`uW^fZb2Q8pdcxw@2~k>) z22#0HI+>-$^UT|ZtR(_+SxZf~a6lp;|Ca~=QR(g`G3u7_cQYvtHCQp;OyNu#`VsUX zYCx|h-%P)m!Lu2a0p(?lU5-boz)|*2I&hS^lhH>;H)=mWt72sjPU3ees&Z-3E!|a6w+53zt#qSYZ&l&=^qfP#gIcw16`EHTL-(D~yGG?-ctjUv 
z0e$M67>f?*X4nAsX6J(}USaWJ7DW~xVL=Vo;Yi-0eZ?uWsG#Up-r^mgE zylh(k201^{Huj(-K`?TEn8iuXFlKw5+zPdkXZ#HQB_!dXay){HV|D_?WAcA z*7rv+65wM7ko>d&T7;$j3F;CXApP6?w>R-{qCLA++{9IBF%;@)AHs`z_q2!~0yu!d z4|ZO7k~}PF_a6dV%$0m#96G#EejLhA-|k5z=JYtr6XQlAjf-G{`mlhrh}+y;bpTTR zdS=4)%#6>?(<*cC6`c%MbP^xr_4dv9hqPzzER9HF?S^6LsGK{BC-ueNP$_2Q4sZf$ zgYElbtC4HCWr4Q<&=yjijDWA0mCa0hB-r|Hwdrj#4-O8}z^&-B1L<@on;K2zk~`)1 zc9=+Lv_TTJOyuwMB+*{ZGC6PDWx{G`H>04VBQ$zm92{UEVS-Mnym+mBvy&!txZ>`1 z*MYkIv%-Le7utW&8^fSi!OrP%P`Sp>W_q>%h$U#P9rt<5QtVs$Sw?3o-XkXqr@T)) zDS@!3K)IDZqEHWPXMrr3LN)P~+BvUr4jGN8hwV{)&a2fqBaa9n-xGQ2sHxxpPGGu^ zR7s{gkN{Pt_kwlqP3=JmKjh?A8K~4`Ic8`iNSPMkm?3?g1SX!dH2h{BN%?o^aEE?( z42q2Fx=kDS)2nB83jexm*@pieC%4P!3*yx(|R~`h9GCn1Bv$freLNLB7+~=c*%s} zn=U)IONgeCinUqPrF}$D>qqp)ym||bOT;wJxq*+3Htjxljo%h&+=W2nWGaKT>GV=n z-(KTlU+(oAb`(97^&JIT|I;epOUkVp{V2*P-ygs$#$OOq5Lpma_kh^kw~oH{K4b**a2b8Q7 zSv_Gzkpr9hH-W$zoDJpi()ogsw3~n!HU~Pn8-;s&Lf#{^OZ z(QvI^7qVQ^i^+&HP#lWkh-)>M6R`rUpGY7l8lXWop+k|k#`wGH3c;ToPI^I4_{SQY z{_Nqu=)rzCfX?;^4lJ79F&gyrxqxwX#r6e31B8#ngl`QIUA0+6kQG@(%;CUiM>^0U zpq;_?_m-JHh`YQYnLQ`WX>g(a5#2#f%)Pw+MytWF4y#6NrJH!2>jQDmws*x;+TPK1 zuNQd9E}~81!8prqPF20NfjNd%J^M}I zMVs8KIiKcT9HVMa#To3Go^(&$=HUK#=MN} zjgrA#O9g3I*)arezp!;G*t~j666B;5lOob0mR+deMxoORxlB@|y~219G3KSJhYp7l_DQpEgS5TIj2RHRztnPSRrOgyj_iaE zhxX#8MAb$Tm0Mt3e1uo&gF9V<#y*rC2k!Dxgq?^ z=eA)l?eOjje)f-j>I{D3=k^`e!5P#X@pSTp(A70WAg;E@xL;UAkgqEz{Ml<6)MOXIk2itib`mNP6Cj`n6AXNGBjlzT@^2=tH|4qX|zHl#*)r`onpOp)jz zh~JLX1|u`>*dDlpAtA$ROrMc{v16eBrc`~EQ_Q|f;C52wrHV|aDke|N=x)OvHL*>bN0zYfTwuWwb7wnr{xZTez+xo5 zi9o7N2I}e}L>s*Z2##@B@51Ns=)f4$r(T@*g6}Us_kyTkTOx{fK3&-m*NE=;3c3 zKX(B|l&9W0hDpr`E4qseEU!HPyB#Tb04{e}cdlhfv!uy4_P@d>v&})sF}mDSFoV&~jPyo82@? 
z?zUv8JBHlat6zMU9~sv#z5^G2InXZ>4$p-8MG7bQ=e~|2C>Gw$X)iL(g)>%nccfv+ z#sspxjENpHvEMZl-q$6u61^faZqbvD8J5#rm9!a52w(t;EE`B%MaJdJBs5fPt!$F; zq2AM9*c;Ik2TG6|_^jkY`2>3$QgPANC$=}*vRm~&PFPT_{CQ6`8ykJYm!kKt)s(7N zxAaKCd&g3v*=($B?HTr|tB8zgO0E$qzI{8e*P5+0x*)(oYt9E#a)_?l$!bVu@Rk4W-6%$=t>{$28NI8cWw;HT(B$jvl>Ww&t^& zHKun(wwb8VZ{}{(Oz3lhP2y4dXoAzq1^&`BX;A#!Dm`N;@W~}iS1qkKO1f!ZV*?0U z_-tiDO*}4oJ|FsSs|P%7zT2Zgc$bZ>E=-9CY^U`O*8;&?*@aHBRd@ah>To{C;(Jhl zv}n&ns~_emQeAOGLNIhY+#a;ILr5uj<9}w0br-`NDMah)nT7G~4nDZ`rgnEOG?W%( z3<7Y??_ijT^4HiLG1W?g!)^6@*l^%WdNg$`+SC;AYKU`hqJ6kGaR%zPjSO|SaeSoL z-Bc1DI_6-K4u=qN;Uhwo$iI~LnJhk22D#*Y&8ejhH}uR(!ic%q>$#q$Q;w$zs81HgnG=! zZ}<=3x@S&_YqOp>CApZqIbH$$42N1h(;KI~DhRdPc38F>G>SBfy7f$!=pmUVB{*Sy zG-nq=m5}{WcDZb$BzJq5Qw!9<^&toOU%{3Ku4f&F*n(B-goBJSk?yR+2muaqI^oL^ zJ}fEU#=J&n2a?7Q+85O;Rpg#5Hye&0Gva3kj53y#Fq6)pCKFev)5Fi-Qi zhN$eDwtJsAkl2|-qQXt2TFE{*F&aC%6aC?@^7Y@x;`>?rbrwI!;%~6{JPS^cbrcKx zndoH-u&+6nm;6zvOK{BU0qNN$W>cLL{DkEwl<9;GI ziyhI-675}~mTQHqd!#OC?1P>kLVfqKX8br2pU3gYAp49%27BknB`GT7$+kYbLXbEN zB<|?!B<>c#TKzFdJSOQ_@5UsC@&l!erfk|)!-Gre~^b1+I?aCh~J68()V_M zRzJO$yp+rfKRsJLs`FCx`b;dn&`E)aG^akqFAFNPBkKW4W4_t%VLDCdP!nJ)@y3Jx zcVn94{+RE!w_3mw;p}Pi__+lXJ;aW$&ZZVR86j$X4;$MpT7H;kwK(mh5qjQ?ezH~u zrEO4Yc}IzPk*L%n3a@(9ZwKIiqF*~Gjk}(=jr01yG>)7nfEd3)5hYf>K3<8{b4kJlON+J6PB0|E>mh3LUgR{1#u6CzLg?OzV%%ATVa~@9m|HWBoaW(tRAV!nTQH2 z=1!zPzS(g8d(`R0REp3uSrKI9v`u6?WAgZsJt%FbkWc8&X|`LlqrIFxBidbD+e1&q zH+m+f(OzE(AWkvzy|{6Ifw=YJb@m=`6r2tj5Q8`0Gj%7;U`R6z%`G?th=GuO9nrwoQTn-KC`2pFQ7hE? 
zpAQS8`XPr7=Q)YU_2usUc=SC411m)SXBl6|fk0YsRjOF6PpI@X>e1LUHi08JQQRhv z8)e%lfX47I9<>h(i9kUq{_TW@oag>g`k_ZWviJVfY*FVd3>oiuY?cIzfT zg@-+MxQzYiHy;dr0M!c#;|=L2m;IJlYg-rTXp!$VKE5K`J}N5wN&ARrqsGeD^?eEX-(A4#=Q0uKI zpNVPxy=o}Zr2_KY)Ayy(S`G7>(4)ZtHO#xS>{BE(70LC{x2fK9wwa_9i{v@eOj@v3G%*x&LS=E(pi2EZw%-xKZ`@Jlg8;Rx|0ghyq0w7EmeX< z+gC?nxthU|7c7yw zlf@CHRT>>9AJPKt2N7`;kXzSroFB*S5w<7htu;c7^+Clswxy;e1Idv zkmCF_P6W-c#T^#vA=O74We0WOgXljaxDk`hjRqvVOv541=s2*J4_l0z$ zEW?L#S%+6E_}M?Ao_||0k_}(2VL_Dj6}GaAepr#LFA-L36wI$)6fNQ*kcS!i|7 zGUX3cok5XQ>4oyQKz`|L)3F}SS;o|OR`gQ8 z&YNK8+w4sz-<>yMQgYwMYwKb=^XmGm?xP)~p4E1+{|V^TTIW;Tqhk!~5wwwT7 zj@vui*&R3#G7YgkgZdbIh4hSZ?1Xy;88$^>ROKhb1MnW=#;E-N3-%)_|K>)>PZJ;% zHDprl`yyZ2PuJg#XYh5Nj^=B{VMA9(8y8*86nf|yIu)V}C(VsfJ@GRh;^4WpraA-5 zfZ40Z(_PS>$H%+p!YyEwK=){+J9Yr+1aYTt8Omtyj&lgr>8bar6H_NJCb0fY=a+b? zkHt^>ZG-LqOKh*`m5dxn`DUS>ZMJRns7czm5Ne}0dI8=E0P$M+ZO-DYDJtkQh_B>&quSnA3Km zAS94P4YBAm1xWSAvwKQdCJ5PEV?pw|;VLri*rX)9XAm|^Xw@cc5?JMz_PXqQF)lx**A2jk*;!?ha^GgpR@Qi6sG6s+eu^5 zzB1lT=S(LMG^_p{f?DgKc|_ul!7jHO48=FEhA1qErLK|A#Bi*Q?w9#tzrx~wW%27Q zzRaS-La~TfK`E|2<$a!*=Tf;rM6ttH>+o6yKl?|t^UwV{h~`}+zn9E_&A=nnwfJ)W zWMQtYiRUx8JtCg}H=K%!=bzNK^|r|U6ahqf7xBC==1boV5$#*r48QEbx=fUq0hPyw zqN4g|_|5p?n{`IV2y7!JEAz@|EnOIxM}_aU$QCOJxm`O2#n}_t#ESEqEPe__4_KJM zhzKlxj%^Q!?U0Qm)Yt5I(X`*T*P}LR@7G^^&$5YK;#*oD5Z}(P;6hL7;z=G=?$Z%rcli|d9=f1G5$26h`0X!gI8T-Vq1$sbe zv>&2?*hfR{x_6mwyp{obDBOCVMBDU4T#~w;BpsIN()vBIAJj{(Yc(c{B=^gH2N2n= zV}ruXHJ$5)^}JTW z&;H@Qfiw8IkE8HL;VKcFtfqP__lc4AupVh+-rBrVl^F)w2PErbm6vLD1aB3ZjY5T2 ztA+LIO(qWv%p@TDwf&B5;+P<;U@|~Z(lDm*9d;m9|;!nMr&;WB5^A&Kdngjt<%5k?a#kveyq*(XI!SK8tMr|`gkWXWNm3dms;CLFQ?!ePI8oTwejOd0m)Z00(%sq)8a)=?P z+lEU5m|b$F=S26K-I$(h57W^%g+NFw#q6>T|8EO1&XTT^uaMi=1LkW6hfu`0t3mJ$OGR8|f)|X$}V@?SEo7GGuYf3f(#S^OajzY(%W zl6L+zivN=YPW2LjzX^CUc4HvP^A-H;AMP1AgP*$y#nG28j0J~;d7IIt?>u+68pr#0 zse|eew!76~bp+e{)KPT|+dXPRJ&Nu9>M?a3+r4U1O<}uFO{>STeL$U1GuS?;PO4Mb z?pIH!C$W7EbJ!kK^Xfddhtvgi z5!=J+z3O>vkEl!Peb^pV?^iEidrZBk7OGlddI=teqlSf8YOx4s(dxtzcbDEj(^j#P{ku~Y8 
z62j}Rpx=Zkn0yG+?&)wNO9lp*UzKVNGlgRke#-1;I3b@xSfC8i3};=A z**ayKJ&%Sk#hFKR-SPV1-2s)i&>vn9>u`J9tO-(=4ba)H-Qk4GNSxS4I#fk`R!n{o~t-^tb+jq6wsUGO0~ z5?#EVEcB+T)6{HKB(@2Rlh5<9{Kx#-2pY9Lz^F=^iGu1MA_8~g_5pB842bYACthH1 zsDz$45s$`bMezuH6&2HW3UI@MFH$uLbV8Sc5tK8|sU6uf%rJ=M;2>Ywsh}%iF$;q% z-j$WkHt2OWk-^BQvN*fpIaJR#){iy|^=kPB@`YD9$&~AobJhV;0c~{qCe4%RPJt&6 z@K$X<*MrdZ`JQjk)%K{yf4k^m!GYI7BIp7O>5$*cJ@!iHk5Hh)QVqkJ8jXrjKyg@S zMC2sFauHDk^{(t-+CilkG{xk|l9xK6wAQ!*JqudW1TjsT;M#4}8M3L}sZrV2dAj5A z+^@<8-`@4_%aKP1cP8`n4C*p$Kt25$MH|of% zFPUm}Drv&c08HDpZ7=9#NTcOwKhTP4up%tR%4LWFB`&6pvfM1)#H>tnk9v-f(`F$; zE(_STFN!i65h0qv-AmQ+5G2Fz1wpVu=4${nH6xQ=;6ZOmK!%J$8hUoATjBkkY-|0d zQ{E7giwBwWCwOxdV2_Csrk~vmROh0g0uwpY8&q4X)B=elW@W?sa6q==>!{F9$v(W* zBsqZfczvcxZtq1H>7yK^@sLey0T3dCGl`F06A+$|VrA|eR}07Lk9s7l0XYaaxl_hK z)$hEYA>h7?#h4d!1~nC( zxrqax(A?omggpltI>m`*%4 zVLhl2zLI_qLstx3KM!7f{J?9w-%LYA zPSXlFGTWo~)!@ABHZTpj>46IUO36!Jkox#A- z;GLX4QaKE*hT5OGBpNCcqrjhBI9DyVnl*5*t4PvN&`BB!1R&C-Ac*qY#*krUVF0=ysd|u5r7$^JxZP+S#Tw%4)f;#iy7yA8$7@`Foh5LSjk&YP zHo4xodAv|AU>2BqmO7a^eY1MhsTP*XmFuF^h;bo{62Htfj7A=0284171MN^YERGO!ppSroW#u7Lg3c)gupG*Gi6F5m)OLt~$kb#5P zLd>`>X;#BaVp-kXweaXJbdynV9YlH zJ?*TsvWl_V(p)=bWOcljkxo-|aeXna{JHXAlw;f2GO&p zabA{&)M}JBHYUN-t7AfqsAvraCJo}~TF%0kSBFAZhpr(%p&5DeYFH z6Lh8rw8*sx2@F6q-}F$@J`lLmph$R_0;IgRtwT#6?P^a!X$waz(ruD4zH%+?fKsYY3;2HxrB4q2I{jst_b2q5HPo@!mEnQ(3s&>d+nQONFMCH}|Jd6U>dM zX%H>RKx^6_sA&-O2A|q?$DhWVUuYlJ{p2E2>zV#hFRR<&c9K+34}>u6(RRsVZmqlm zzbUe25zWX42E|JBmD7=Gy!+;G$LzE{LVt~`zL@6w4O_+Pr{9Lkb$1ktq;yB2Aodd9 ze5LuObV3O&5dJH@Q~D`8rgrY;v|O~d%i@rH5tZy>W=Z_Te72aW$o{UF4!m7P&xWAz z+z*oyX+9Z%`BIy?&}DV!Mc#GNCcE$`R*$gzW)fyRi}tIwgvyS~-#mU20}vUV=rOJs zx{M){YeQ!OZe5ITBcl{=5GUZgfQmH`Enq!PMp>&-5b2eV3LuQJPBb{NW4Wbr9f1Uj z>wV%xwFD*f7YQRhd4UiDBa>Y8@$KJIjTSWQzf|J{H1ESsH)sF~8M0xkbpY6PSF)ng zQ6_n&fgu^#+SJ<+<4v@HhUjE5-oNM4$~g`Vn&vZ^6%)4U9d( zNFHN3qb*@r0NhEeI4)YyNmQLrV5@Bl2f0TiVJ*;G$W5z!d4%9RQS*EhUF6LXJn5XY z`q^J0A~f41BKFg+Sm@0*^+Oicc-SYy860%OTAU>$MD6f^-FN^O0erdZ6y)tQ~XLQwfM3$;p?;*O`; 
zP3p1`3`yZBK(MD2hK>dAKV&hJjuD!k?;#^W7{Y9exp|YW7(TcqXU;nU|8}eu``?fx zd_86Yhy7e{I>O5EAdEwYssJ+!*!0GPiyjAK2^p6lLF?vnEj;aR|2KRPpHx1f9C8EX*q7cy} zh+^4q01;o6#S*G~*E%OH^1!5YW)94K-1r!L`<*k7`k+#8p>qnBHHztpF`YMzRv@}> z-*g@(h4@`FFnsKc7?_rqkvlsM!O@;6gz&q7^rqc&M^Fuw`iH|9W&zgZ_8cIh_OSG`f3+aVH(s_CBwpw$tc`h4!)T{^>u>JG}ct9rV!K=%6|n zKH94Ux{nZ5k^U*YRL|S%iyqt6!+4_Hw6gOG4#Kh%>{8pplH$CiJ&gKA3Zq1ik*35l zA3QeNG8wILb?E3raw-ozBvu`5=L;D^tFsH~^)IbqU0w$KRFzH&AFz7qL`t+)MkUX) zU8LRR^eEMqOVWPNAGua7Q+=BtU`J3VB^3XW`L!?i)^LLzbjxqCo#X#cKo{0=6p;5u z1tQt+BYfk4d@%@QEj7IM!%b525eLSF33PzBxyd+y4Zu&t0{kPJ0{rd2F&_9g(9Uj( zH*3U76o?TvLCYlskWmk4*Fo-kdxQ}pgox+7=NJYas=(DDZ2^*Afh54s!L+}~xF6)< zhf23D-u%K+3t`8|nNg_Mu1jQ)yV|%(ozPwpHxD&{_TzyX>}C64q(VN{Nv=2AL-tnB z_ro!TwmjS6kX8XD-sksuRqP_Pm4pc8QBY1X%`c30M=fbQ`!hWQ&k*UXpUzmCHS)4W z{1)w$9NzblGQo;0E~#AbH;>-364;nZyR+ng=uF;~ldvuQ;33fkd!q=ER$?FfQ zBjf44(sR6K=;>c#{&=2X@kfjRgDm)!zvs&%2;bLuhBy6+XUbZ`2bXySE}GVSOu`^+^E7_CFWMKT8X<)+i!_? ziDGFsV8}v@DB^s?K(#W*=V}!k(Q;LtlV})J8M@0p+jYCMjeNip0rRat zxTIaHH(xr!tG>O~H{{Bg=zi4g?BNRnZp~uesd`oySfh_JcB&0*^HL(4!X$fs^45HeN3?;`#(qnWiY*${>T{vb|XUSF>wYm+L>R@X4Jan=w9 z4HXI4U7O=ty)AQ{(j@e3`w{!7|5)$|V><36t7|&;OxT48mq*bCZ`8Vo{ira}#z-A% zc8l=N<0ilx)^V{T^)W{3hb-tQE-YiXh=VBT05Vz$bO^BS@DLc*g&AZLMiCHD!a%w6 z64q;SR}0u+%}~a^PKl|dy$@Zi`x-^Mrw&uq>ZB#+Q8Z)ANOhQPaB>+{mMD01Mrl6& zjXrSlAUGMkKipfb;v*I`p)ngha8b~R|5j{Q!U_H1x8ar^O|iXfU3Z(e016Q#(v`WG zkS}00B>K4f^ptwG2=QMM^(fKAdgfNO?pAM7)%pYhq@^v@eAqT*c4xe_-K?^B&^^B? 
z=oF@Bv1=kw+!r%c90_uW7k$>FtH*?VL>pYHGG0kYD;~hkalE`8|3o)Bgan5I{5Asj zNz&S+G)uTaSF zLi>u&-U?!nr4n>P zGD$pMOyMcW=Z65v_LIF|(yt;oW%_)Arq|K%Hye$hJsPgpTeMM_qmU-2nrQkIRY0uN zLa8!r@q;a_X5a;rlX{CohD}seamc`{bi&`AiE>#zh1WdvT_oahR3lO!sbI$7yG2@< z)NYaZV<{TCMfRISr=%X%}pOR6Nfl^5!>JSXD1u2Ut-=%8vX0^Ir zvMU{$>8$RL5H}JJ2*#4bKRxwyB*T3eb}sc6i1+v=i#LcWQ7ec$_h}R!-o9nPUjp!x z=;rh_oyQ_Mf&(mF-Av)E@jnUFVHdcOH+dQLN+$VPx8l?`n%dB8_VZ?alf3Tqc_d@h zH;H4&yQu&TpSnBlmduC~4S7-%Q|B(S*BZF&^n3 z6Uk0i*CWV=c-W1nobN_~ZtP15qC=9F{vwgXHy~;0`!`2~XSakuak=Dj9c}2}v5DHB z0uI{u8*t#n)liK;*L%X8r2x1vvkL>ES2m}D zxAi6qPJ+^cbGZwg+86-8k_ISeiUYjV&5V{Xa7~=L90M3;I;2ep4C#7HcV)0*JO^{2 z(cQ8LG9PRYFM?PrnC9mkn`ocXk-S%B;ddEcU22x=wMwmBMeG!YSL~^>W~T``7dsz0 zi^#aei9~S->2w=w8x6N=!?uO@auC(Y$cwbb>}zf^t^a+jWExe4jCb-V&qgJkURYJC z`wG!DhQ|kzc^Pd%(!p(u0FXy|qfMT);P==G#fD8gr6)td9tYYj$z*-*QKFfQJE9=j zD-4tN^6PO0sSFZqazSkk%hQFUdhcj^kLma{sM_dGSU9ZAS_nljiO?d|Yc+!6tDlCe zqqy|DkR_t|yM&I6MU#{?T$WiUNr{*C{)6$rq4FN?>e}NeV33g~#kZzP5f(6C-p>Xv zY1tr676DM1b00)y`kILq065Gw5}G&k{16-O%Wi>%5XQAlU@{#FP2ZR;%8yyDFKHr;uwqB`)BL%PR<1;w$;fVAmRFfne^IEbbLEWa0 zriwPEwu{h3RlEoB@ z$63sJFn&$5_hahk;(i!&^INXwom82=89*$Gl`uV5#aL=MD}+)m^| z97Ga}K`d92K`M*wxqR+ETzMEP*$v`1l)FDSoZEx5{M(K*+1w$ypGmy;<#y!8aL*|A zGY;#o;Aj8H+s@$U-a_Fg!PrWQB{hsh4>>iWwqZM@M%8v~^J+}(z;;;eRJ*VpQM=WB z*ltsM)cx3ws=aC-w%gSM>OpMB)P8WohxAfx7qk$*yu=k|UA?M6;YQiv5-EjcWNB)F z_zIY+WTds3<`hWT6k^pMM3ekpggrC%4ah|~6FrvkwHL~#pFzMg?vis#v+t8|d}03k z`bu+^U*xYzO^YF?QepA%B}|;cHE){F6kIh}O?U1D*d69PEzM!R6OSdRO~$KS+FMjk z;bOiF{2sKVA6w5!FU8mVNr0cHVLrpJ{UEzx7_@r&H14{TWKUh5?u+r}?mF252zf0bZ zl~>kPKI8E*h{WBA{U$?d8~}S)V&O`d(o{5$pZgLDpCFrnpPs+-?IU_r{ATJEF!B!! 
zTp9Qf?-PM(BiWJqgj1`zC>0DkVs`V!TM%wgZ>#R3i z6HbgzNFb{jS9fwZ%MRlc+|DT8Tv^3re|?P93bI=OpvWN_EmYEPD4Z$P)R|YXg zpO)^SI|b#l<{H1jFaqXfIW_^dP_0}C#RIVLS`|dFB2k;+CN_D?RNi_iUVx*aS{A>4gE zF!OV4mi?IZoIVOQ?dV)5&4Xfw<&5QQqI|G7baLLucZQ)zV0PxkGK}2)y6!gVy53_C z5XYjA4boMh1G?k7srNuUE;-!y1G>Yy>0zw9T0T8~V!Y5kJ^sXaVeRzz$??MS>G7HI z!tK-JvNQr_#re1COZqmsK|d(`3&Wo=-B3=^z3>Jm13b=q%SaR?9FTA*E6siYm+iM+gqG8yfU<3b#05Y`MO8Xg!SjeHIp?e9!C#ogPLV zPodYdpv*ws=hsw3y~ZELezfu0VW#!yM{Ubeu(OLdD6VDt=6n2X@hu7;#Z-|&3LJX1 z?)h`IgH?x%qeFS&S^AS6x8DzmqC?%{zQ4!Q@eO*t7alxkDMwj49gr5(DQn-SFHi?` z)0yf`>e51+6T5tnz?8QaBQW%90~l`u7}@w?4Kg!C1LjRUDdaULMrkvviM9x%Uyn!_ zqTi1YCZ*34rh0O2tKO_3y3Wp>4ukp$f?D`KK+TMXW1LcGGNZ%+eXAA6V<9r$)iBg@ zz*mwe6#{(&X~Tl!U5sSAyNHZmH|rvLXl5G#;ofpw&a5<}PqIC|J5Q(xr$h&W{dl&Q z6x>Q>AxKavifR*aw^~RTLva^1%9u3;)j!>S>he024WnDY+lHyvT1{m3$~puTY>tq5 zs!9Qd`;5_KijzO)mZ2abdZRWcv1hgiFL+(}1&*{1k`Olo=Pn^Mtjd&W~o79VV zsUZL~kEOI=SrVvGl=EN(!)=~8=2kaOx5lCJO~O!z+Aq(Q z0biNSQ9xjYGgoL9W~Uu6NTkEdmYo8QscB|NV;jX(*=c?)alAcLKJFYZ9LL3WhPzEG z`K9*@Wj_iybaLKD_3Qmw;^Rx@^REycp0#QWu_;)-Kq4pN5aK-)LWLdTye}3No^8rQ zMH9io%aTIPhCvG86?iBY8?^lz6M%P-O&*w>y!76M%U3Q;PSRFY%;F@r1XJ_&M%68j z;Do+|r-P%t50X+jRFM3g{eYquv-ZGOs;#WU4GMcqV~sJ#Zjn%8Mbv%i)swa8Bw8m9 zPCe_12i&19z;O(-!r8(Y2?oX`2q=jIiEqgCx~R}a4z2GDYIbF{qQitnMDIT!-IT~R zH+Qwa!uD%L2pv|IqKs>IzbY0cc0^52RQa8A#L+y9ODtYwvB2Uoi`~8i&f>ypheloIr zcda+zB)dBIQ&7yGKie$70MF%FDDt(X77(DvQRLEMG@N8MmEaIUVm*$Ef$*pvBE%EZ zIKDm15NST=$VsL4`FaRQX*-f_N`0rrIAHAI>v$eUOraCYUN-MKTCkLElptT-|8%?Qe)b#Q z2sK=0e8!+%H@sgx@olGZCW>attrhx6#G+vtDO6541OliYLhJMJ_J$zv6;ycdV3-ls z5&e9KMC5wpdQ|ZC-K^Feg{*Hi+`57Ehi(Wi@9Rqt_f&BRP$@KL9zVB$A_^#ih`v1w zB8;!X1VRuUopbNU!xnT*N=w2S7kM3GwIYHybB^d-T!F#jwFZLv6^5ROG}WA!6J#+O zMO@R(5R}9((vy0&SY-%4Waa{&xK5<`g>Qb8!5lco7+`_8p`+!c3T$7?N0GzV?CPcR zf@3^_q_~rk(0-oqgMO?;M}T)f^9>Xk7!w zXY%4dAA-hXeW4M4(n5m0BSZndwMM*R7;DJ1h%T8i0SXAWCk_mv&yM+^(BSC@32kCF zzOy>MP75Gkwu|{j4_rk)2B->I9_qItIMI3Px0yZ`d9fG~a+h$DAsV9z)9nj=A+^<7 zEexqB5RUx0PonU+%(cX~a6hRA5E~=@4L{sRk_k&Rf#p5v1<4*?K@Ikgb#rhfnz=%Z 
zO-XwrlM8xan~()8MD;;v3h&fvkU%NX{1jWxItCK${tVWz9gb^jYiq4$c?ouab%xYz zIG88yG;jNHHBHa=Den{A;4x}dY%awQJ#k8eWrZj(I#{tkLC4w_qVbg z+W*PGIV@%;$v%2A!p&Q=(7oR84BPM;zw@IkN-SPyQD(8kf;e}US@;Agdu)yKDlV+> zOL;1n!rY~F;9pLzhpltpiZB%Z*y)I*bDx;i_tNqte*8J!Q!qjr7T#+DCY)Hsm-O&~9{)KE^CK z$`~79`8G&#_Q3he^NSyT=|bW8#TQ>Fy!7(97cR{g#wVwzUpYBHJ$-)hyuNU1>hZ#& zQ(kv#%q&>0PfuT17|(n8$8X-eId${oRKr=BUc54W<-+_Vqts44K8@IJ`^c1PDlFye z1v_!UY8X1zLh&dWbopwao+!YhcE#mEQ#o7LJ{VX@&f8^}0XfG$|3Q%L*)$a2?&iwGGS zbshOxEaH3}ULkzQT3O)(be?r&v8y*5&(^DGyQ7>lZgpk34Er3L%rB5uuz#K&KaZ@8 z3;_g~0-qoKI5<%S;CgA5LId}&F}O&4B&Yzvl9Sk!H$VP8)og z*hDV@J#?>eZU1KEV(2VPD$vSkWTxEl!y0u|8<^G=P^+`T?t2w`#qNY^ZPY#{w`mJjE-f^625`1GzCTAWq-N^dp3Xb__=3LF#o^E0yr132sgThNhD8K zB=RaXy%=&H=~#c+TIm3ePmAb8u9MZ>vy)M+wGG$dr*|^-Mp?Pq=O7>b!IysLu2?TW08Q*QRsA3j()w&zHM?TVf)`TE7Z~tAk zj+)Q_Qfp~TWLL|&9%Zr!iFTdbi<26xago4a+6hDo6^t0D_z0M*6B(0X4 zXyKA3{hzY1k&JEZEbW7_un9jG4-&@the3h{gSPbA6HR6r@&ID9>65Au`HMud1dBcL zrmbLBIw;PwcI!LCN3wpJH`sd+u)GNStBw{I_PV4K%EN^H1U}EGbQ5N>5WBOcD1oR< zxAx^m-Es{8hgH|us&50PGVWCP3@J`Y_fnQg|zfS)vBo$dpiowMaUf67w;Zavq z|C#4h%qs1BsH2QU%*<6l42pAE5y_`F&gX6 zpwj59v1J=9Tox@Bw^_7Vd;^O+EEI3HqQ%odWl%|V^*i~&WMcGSjO=E#srW5Mu$L8u zwt|cHkIK~<{M>It;SVeStQTKHCmrwXV5hj+TEbkdCUO-cXR+-7?E993(rOs)`$4fM zy-$W)lxQ@*Znausbl9v96>`! 
zGNkBa7g)4EpkH0@5OBeHWtoN6Q2`uW(3=BK78e7N{&|pQdrERgMxHuSEh7_k^|@}q zZxO`KH?mvU9-)^>)ykF+Ru1|93jyj2B<2r9D59|55S3|ghM)a217I=3Pk+J#;Up7n z{#SwE?k4SRVJJK?n2iIW1Y>Lp!YAG#4BuYwb^aWv!$9wy5R@JZ`rt5~bUOhO(@Br@ z=%ig0+!Ig`zJshSTA?xH!?(rK3_M!ms)FJ;8{~a+T75KrnN9fHmO9^ht6twnKae^G}kWVl#AeyALu>o0-=p=4MIZ$r4US3;L<+;ObIUXO-8GQTplS<2NHr9Oe zxmSnR{W-pxMVdfC5%$lWy`l$M_~v&)-C9reiBo^ac(}KZrG#M~-ITQT(-wxD_5{|e z74=VIryFl?A~;V5>PgHdM~-{#W);yb$2t;Agm*^r6(uptoVoBNkC7Rvx+4GBwFJbx3vs3ezUwqHm)8~%NpZ?OfjL%)*msvpf zCsyY#XioSEY$C+!d=lrLvvm66rWl5n;<-0~wG zh9NjK4Hy%!^PjNa859#>chI?x?2S@ADzp;eHc66+T9yF+H}9_?FrGdeJLolk1n*Cc=<4a zff@$dFKh+`9W3Go1|Q3<^$INjG15hkEuC?EoyW~S#Xc3}W)$+ptN^*oM0c#oagIem z?{~jq`xq>Ch~AfcUok9f3sqT=vTIp8#yAebU8~6zzv0J#I^)$9xftJK&YB6d8L@WW zjQVy)T!Aiq)L1_hN1nE=B*-jY z#BM|00UWz<_>mctN<# zm!9*DBXBlj(?_h!os`y{FxOoGK(Z4V=#-`3*9s;c6B%m{!Ag zvH?wOWTAaQM^2cOUa?Ugw;v4<;0WiQSXvI_9C)KE}b-kLwxm5 zhXH8S9BEB)jS7c2HZj(1E|;Em{u0@Yes>A)F6Map#FM2~R3U#r7^~+64pG=Ol9XuR zsdO@*+9~VJ>6tyF(8cu%f-(IrwlYgjp-wkr$O(dx-OQxq!@TVj)>JbV6cQ6=aWngw z*Xm0sKF;EsS$qqNzsTa-SiHsJ+gW^?#dos!42$n#@mE-Uj)e~m*%Oz@ zo4D{tkT-4tKtm!Aotz_&X&reamJ3)mSip_h=sFpPUP@TUEC%vzkc(P*GlheAPRg8>><-FoE!o2x z5{`s09I3^M5gZYew*E9e;TztF(YiwecpwT)>N8KbFyWRIoJ(AsO`Id24iKMKu0-;B zfYAo|zBO1T5P}3X<6e*;93XXVDh)9XLh8oCAv9Vzh+4PSbUl#MjzDihrPZ=xA@SW| zNWdB)!Zr+vd#Nx47y@S=KljKcfH5gnXy@!AjZOxJh+0iV64a9;2cLQB)ES`;+)PQ#KIRXqnB|Z$2atW z9M|Y1DT$pDS&xL6;3*3elk06a1jK*16+`{$1_8GjRv-4%Ggi8~u5*NKS zTEpzoAQVjpW1Of2O;?)VPe}QC97t<2$bEDtv#{!ZJ_HX-Nr&*!-Cayi1n}&*@m5ZVBr!qqjQOo%T+h7?WQz$u|D#>BE>m7E7)h&X zz30;*Fj%qrbQc(!3Fi=o4)8l*m~IAd>tj5FB#!n>JoW3q&84|lTQg5Sjs#SGwhMf^ z4lwMbu}X)tDe(_U$LFm-9!2`jWP49H)$`U~a~i%Egz5ZM7T?d}2U+|L7N2MFH(C4; zi#T2DdvM_`egZGb1X`v*%g7H2(i~o|;FA5b+7;tfx`R(Bd^+3s{TIEc3yT@{PK%ir zaPVG`D5F*Y`2nny%m8X83X@f*B$`&j>BKY0fKaZuBf-?{vpmLN(w|12UAWipY@ZH& zrzS8LVg0^?1cChZ27>*)rPy#pdhUhgd6 z1)U5gS*}qCiQ#}NHgyu$JBb?s9E%>8x2_)dpZJ8vM724_Q9S>S05IiO&YTgC;47R0 zivukFUJsJ%?C65zFYwa{&0NCyQ9Raubataq!4zXj9|R`~c4o#48ZM{D^%hey;Vx8n#W<^NkOZ<8XQAv&E 
z7&AGN#&CFibI`^B?eFu=asBjf;lj^YFeM!x$J0CY=k9(<+cM@FWmFd1j2cuqY_mG) zMdu-YE7HT)RyZx}s-&dENM7cQWAX`JC4Z!G+kZcV1N1@kVD#67O|wap7`dqFp_%%4BIc=jV0lE3yE z7Zs<7(_?TG0Z!9YWZ7Eh2+wgy?y=;GsFa!o;+Y;CmNCp9fe$jLl@&Rc zWj1b6U3A)#>x(c`F&7&j`SUP~n&Bt&*KIH|Tj_Bq^Okvz4=0cy?OdxV!&o>} zTFyFBM`D7%S;wM@7-ynO0xAPBh;VT|k#Q?W6 zmy@#O%B{KlLL2$+OHl$eLK~|>Je;rVM^-(YX@3qcCUI=J%;T*bzDti`+%BK)7UYJKCGTpPhtCrdRje$ zZ9zS&p2PNlnpLN<9anSeJ=h*pXVh7252Z7u1W`KB^YfWo#c)FR2e;dt6;nSFxQ`i|S=;r_=}4E7(q}52+7h`?xBqk6?R3 zy{cZrc1C?vm9RajURPyoPpKtU!S)HIR2ADN)v{W__9?ZhYS=!luBq$TKBMYt4clkc zx@utioZ3)tU^}ZYQ4`zKstE?(>Wq>2I1Tn{wT@VyhEx7iFyp-6(Kns#d!pYAUwWZV zO=}}d{Ip&@opuqzOZK|v*Fr^f85n`pL8?t}mhDo#ygJ3mkZ7bxfve)wPsGea)p2l> zs*H-%B&>EOpUSvC11%I!avu*rU#-_06q-&I6Mnm%#1vI;^&3q0<>Xyv{XRXuTwbr- z=BvG{H7u)!Y$KS-d_t<{%-E`f7o5eU-V^XgoWQk>Iaw%&{wV6bIN{ZYsD%yvT^o2t z?povGKK`w%Gl6<%biK%;ygC!CM!xQZu5?CkPMT^ZJqsUeu(A5}-qc(Zf49_FnyTJH zL>=OrIBk0zbDn{dWv2#yjdaDp2_c8u#^=s8mh=qX(084*FDN|keVn8r8DF#0rp?RT z+FE1%X049ALx?X}uQt8c;m)}=3H`%rCmWa(Z9wh{s=ZG>sK3QMQO0TBh8_Ju-FE<5 z`i>byc(F#d0#DAcN-}IjZQ(RV8k~|WcX;o_X*}&uV)kA;D#zrqfjreL<`3rJar)qOnb_U@WoSqk0A zWAEO=%kS^Hy@l=`sCN>_{_R)8w=ZL!y1f?%a~ENgs+dpE5b?VyL^}-G!ZR3+l()(P z#GkJ$P>dpGkSY|^CKv*eW?&x*(1C_P^`es|-xS5j(+hopEVB42?s@_mrkCbJY(qgt zY-XmjHvVzEL2c;$_(>*wZDPUq&@-P{M!4;#5?q7`e)qIlNCsvWZ$TNDBw!fDXakc= zX&#d;y!XqgJuuN&Be`ulbpUGR{7jj`B+CVja9*?;*k=|CLen0?b-=I>O-DKJFE_n61`% z9Bm6##;4g9F$^Rgz%Y{kU~_r6f2%BH303~yi=Qkuc5D9~Anvu@#)<@ldJ`ny$QtXB z#J7(Gp=1UkNTCQ;0m1A}9bSy#zJb9Tevx#=URFF_JU8_1P+pjGp~Xey?O(P z4_XL?=!4xQN*`R?%C<7OP#j&y_ui7!Hkbsy0uCx;#ee4E+6=8a8BF%8tVYqR!{_{l zV>m`T2WOQdixgrj)*B^x@D-ov(IQFb*WIEm)Dvnbkt3u~h-)Z10W*@>#v%jTAaz6h zVCkiDh-4rX*j}3jsf3v~oD}p$y0o57N53-$%iN%$uZ9|TsEZW|p2LrX+JVP)JC0!I zRzc;!koJnf0LWq{BM$Rc#N20@($K=!a@6nHoUd(@J;trdVb0rgewAqapIQ70iVl;% za>i!L`8A&Vbr$~|1w=`(-CyGQNV~O>HqZ!?-RpkCy{X&XSdBbmTaK4t#X8gSbJ!uN zQm(FmWx+j>5_be*chUOfN)pyp{~U(9z;0IMR}iiC|8;h)v2h($IQ81=^?U8aX_`l~ 
zwbM3E9XDR5MM_C3K#?l7HIG)Pit9#eU&nTF?Co8tBb=5!aar?Xp2|0tBOq+J+H4FhNzFdEfMHPnx6IneYzW4v_SpgCn;2xbBOTNItumg>8xTofy8=SM-TWq@&b>GG53vE!yP7}t#O*Oo+AEtoWxAX0w z+7*2PnbdeH5<=`dnCpthYS`ve@GOJgjhoGPcwOkX@6mpnqe2PwLm@Q*!cNyAK}wEH zYPTm~n-Z@RDqB*g-1hU`)7u>r@G9;T(%r!8@q6 zaf`-<9*KT0;GQLdec1U$N@=^nW?mZ-&N&S!=L23|Zm3X7*YYvqLPpXK>DG>E8=Y0r zLC8u15!yWHbwCn^A`dRX)8h1Y7Z4g1Cv>}Lu6-z?`=+rBM+xa^#*odA?Fq z&k~PnY0dEo+-x2bDL$-H9Qt@vO~$V{s6;h;++X zS}0NQ|4OR-jo@`+o$b?_y>Or9V~IhDZbebMIEO`XOFSe+PAqDCE)pJyy{^)aM_9xd zq-r^&pf4{yQ-!2B=yf{^-IY-2p6TsQ97>F6&*XNg9)2=KRa;aIpPI5oQ)cXDDpi$q zVi+p=o+`Gkl$in?VfPQx;GYEB1pgxVH^F}h{!8#bfEKKT`epK7V-_1&OO9aO=Se&D z%i7o_=J;h9H68lx+938Ds_A8F+uTnXB5V&k-&f6JKGGqmEp52?1%h@K>}(~~XLqoW zh$ZHSzmT9nj^CbH*U|u7h}?q_+|Y{By9tLLa!(&8ipIc4@0@5%V&!@bfm6($2&@LZ zrm(_!5QJ7`nk2+Adq|>D3OP#$hbrsb;+G?>!1y#LO5s*<(6OBYl`upn)_kl?95gK* zWgDfreksO}?6#^ByZ;y#?JI09DszaMqr#~=PJ5Z+Au7D?v{#%<#29duqp@*C@|4^& z%EI!JxptXr_u?Lu_L#d~bL}(Ne))DlpnSg8Z$`mooDx!>5xa19giaZTS>2F~pt~+Z zFH9`X2L0{>L09zEe(rKSBQt)nS=iaFww1|XSiVav)t@!Rx*M{cr=1NB$kST9x^^=5 z%~1ON!Qf2ly@NBUckVMq?)!%8S;dgnCVsi@xnlDkFJg;nB!_;L0>EJ%6Spb=%pr&z5|hBOB}~MK&I8T{cukI*qI#0wuE2ZLU4$+N;-V;Ij-m`=)~-BLi2@ zj%XhNF`(T_eSu+ks%#2ngDN7i`pF{JOgzPkxe57 zj0P6ht~YpM;`b>X40eXnK`o|6bN^5Nij17d9;{@&&=C}_8x=vhCs$C621{I-IlB}J zh(_;X2~6*!yN|0{@}gn=09jA!9y>cCsV@%eoiBnuVcW8YM`_|$dX!(geEswr%v>e4T zl^&#_aEq*#yTy@~qg=II6&bg#lKTU+Hk2oP<*Kw%VU$~~7CqJiBfGd!*$sxQWr@l{ zu|-KJw)#=U)=I43NkcM7K;O=hT2q%&i;t1i20fxYD4T#r^r{gvcLPpa__dlNz5ten z^Hu>vBj%kb3--s&5}uPBoZadMm(*dr+IW?^!Kf(AQCsL#gysnMQdW|+9YxH>(me5R zXWa?^fulQi!mAq79Y#AvUYW%oezwjirTSX9lCo<$=&R2#iRVel^_dMid4o)!pxb;v zCQtNvqk-N${&-Xyi8*2IOMdLFNCeu!%xHdf%*cD4)ddJ$Cv zu_Vj3siARR;gF1hMlm;xvWDXhOQy(oC*j!1TZ_KPlQLr+y_koxz(H?_%73k7lz#7Y z&_Rt=vo2k#(KeAzeuEQ6<%ysu^bASYgz(B4(6(e8K=uCB_bblO3CCON=7Y$1*w?nw z!=}7uhTa;(q&`Pz>|IHF9y-DGGc9?_@(N3R4&n>=6Y6uK)Qj!hhO~+{gl(~~>|wJJ zr>7ehh7%IlKw?&sF~#$N^RV2guhg!bsXXDYp({JLiW8D_2dZxD;wC?tkNe9w{bl6b zm4!y)X(#n7I7Fk6{8w5RZZB#hyhLRDDDt)!wR0-T`r 
zZ>2{PMb(*n4qqc$I3kfZ`t;_Ai;YziFakAV|6t6L0OHq=odk7hbz1dv>RaT|wM4S_ zj6ptcnt%jAXjT)Q!evpMRN&elnNE*SDmYJvTf6h=&cxH-un?c}g37V_*-Q+3yw;g018A6$!x4GUC6LY`@EfFr>Ua#3 ziylrYK;PzJ!5*CH+Uok-NSkop8gVq;gFL0-P2XYZLk+hEmKsJ4*Evv|wgg?i!N^t; z`RbSq=%A-fvQ!i&$<>@T1#kDxC*>W6~0xL;xr30wrO_P6c3*M_noWY@RszmaeOj>DrpH7W2nq&g( z)rycPm%rU>nOQ)#T_G>dn5t(Hb{H6@AVdrlb`}nnlI>|z=7G= zSKkyiXXqj6h}N+Bu7M3=moMSCCd1oh??EC&4oGgPoGp|#z1dYAoGcUHidVTpsW^qn zqjG0cNpW`_QP4#$unv(rA6Ma$9Ao>c0f41-;=d%IfM89nhjtoo=t<3lsAX zI`x}6I%`V0raoCym(x$_#t1dK@rMzKV&9}z@S#Lz67R@?)hHM)x^h(5;32CNeJ}^WJZL{#V0V#H8Ouu4283>e2z+erE6m1f-yE-8*(zXJ3>@@juQ`SqX_i?x ztcEfut;BH1$*@Y)*nKMto8wpfQ3!;844`=I5zTpUG_(2MMtx;@o~NpB z#9Dr;wU$Sfokq7La2R<}|oHTNaap3ZV!S60gRH zAm_vF%+VAf?aSUc6`0Nh8h`|G8u;Thf+T@|l3)U$Ir_RhBq;(&B~23G{V}|A7KgO( z76)OgoX0aA2gj9gA}1Z=^n24Te?+1w?+q8$-&N&nvq*H9ivKTeCcwWIYF?7QLktp#{QBz*(vwPz%D( zbFrH_wXl(-Hb*a(3KeT6Rv&&ehFp##m|!x}R-%o4I5xCG*o;}jCMlgtr=|xNwX=dl z^XBVtFxik*1S=Q3PsA|D84PAuwEQ7&ED{{rWQTxtA7QT@VKxMW>Emu5lNEO`v0S?I zajvl}b8##;s&1TF@A$xVkDW2=D`9r0B=T!AW#+kr+9}}TYo~xuxZ&T@#Qfwq28>ln zzr4^kKZ!ps+^D0`Siu;Der2C3$$e&o!tVs15H6BP=XY2jMAAm}3KCg>&T zCnysP5$q+{M=(rqJ;4Ej8wd^(93nVEa3jG{f(pS+1S14D6Wl^@E5UJsF@oC&ZYMZJ zFiB7)m?Ah$a0fxeE%`*2`CkVfZJgzM9CxKj%<>gV-GJQbxJ&jt=FWTt`Rx!`#6gkm z@>>GzkY@^|p;A9~O71Ql!A{9MM6$EA7vB`{4!)@qpW3CizDc1ekv5Fo;aAautwP$EOR0xyEUc)(buOG%>fqj2hpk68h9eEj9rxpOHu(xM4DuK=#Ms z=vy~uqaf-|+MN8?n)FC8;PshN<1 tags where other parsers don't.""" + markup = ('' + '' + "') + + self.assertSoupEquals( + markup, + '
Here's another table:" + '' + '' + '
foo
Here\'s another table:' + '
foo
' + '
') + + self.assertSoupEquals( + "" + "" + "
Foo
Bar
Baz
") + + def test_xml_declaration_followed_by_doctype(self): + markup = ''' + + + + + +

foo

+ +''' + soup = self.soup(markup) + # Verify that we can reach the

tag; this means the tree is connected. + self.assertEqual(b"

foo

", soup.p.encode()) + + def test_reparented_markup(self): + markup = '

foo

\n

bar

' + soup = self.soup(markup) + self.assertEqual("

foo

\n

bar

", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + + def test_reparented_markup_ends_with_whitespace(self): + markup = '

foo

\n

bar

\n' + soup = self.soup(markup) + self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) + self.assertEqual(2, len(soup.find_all('p'))) + + def test_reparented_markup_containing_identical_whitespace_nodes(self): + """Verify that we keep the two whitespace nodes in this + document distinct when reparenting the adjacent tags. + """ + markup = '
' + soup = self.soup(markup) + space1, space2 = soup.find_all(string=' ') + tbody1, tbody2 = soup.find_all('tbody') + assert space1.next_element is tbody1 + assert tbody2.next_element is space2 + + def test_reparented_markup_containing_children(self): + markup = '' + soup = self.soup(markup) + noscript = soup.noscript + self.assertEqual("target", noscript.next_element) + target = soup.find(string='target') + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string='aftermath')[-1] + + # The