From 93aea7a22280f03ab1140827f72f9d4a57d17b93 Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Mon, 12 Nov 2018 16:36:23 +0000
Subject: [PATCH 1/6] remove parsel dependency

* _html_to_text is promoted to a public html_text.etree_to_text
* html_text.cleaner object is exposed
* parsel is imported only when needed
* create_root_node implementation is copy-pasted to parse_html,
  to remove dependency
* parsel is removed from install_requires
* README is updated

The goal is to allow using html_text in parsel.
---
 README.rst              | 68 +++++++++++++++++++++++------------------
 html_text/__init__.py   |  5 +--
 html_text/html_text.py  | 40 +++++++++++++++---------
 setup.py                | 12 ++------
 tests/test_html_text.py |  6 +++-
 tox.ini                 |  1 +
 6 files changed, 75 insertions(+), 57 deletions(-)
diff --git a/README.rst b/README.rst
index 5d3fdb9..b29bb97 100644
--- a/README.rst
+++ b/README.rst
@@ -17,29 +17,19 @@ HTML to Text
 
 Extract text from HTML
 
-
 * Free software: MIT license
 
-
 How is html_text different from ``.xpath('//text()')`` from LXML
 or ``.get_text()`` from Beautiful Soup?
-Text extracted with ``html_text`` does not contain inline styles,
-javascript, comments and other text that is not normally visible to the users.
-It normalizes whitespace, but is also smarter than
-``.xpath('normalize-space())``, adding spaces around inline elements
-(which are often used as block elements in html markup),
-tries to avoid adding extra spaces for punctuation and
-can add newlines so that the output text looks like how it is rendered in
-browsers.
-
-Apart from just getting text from the page (e.g. for display or search),
-one intended usage of this library is for machine learning (feature extraction).
-If you want to use the text of the html page as a feature (e.g. for classification),
-this library gives you plain text that you can later feed into a standard text
-classification pipeline.
-If you feel that you need html structure as well, check out
-`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.
 
+* Text extracted with ``html_text`` does not contain inline styles,
+  javascript, comments and other text that is not normally visible to users;
+* ``html_text`` normalizes whitespace, but in a way smarter than
+  ``.xpath('normalize-space()')``, adding spaces around inline elements
+  (which are often used as block elements in html markup), and trying to
+  avoid adding extra spaces for punctuation;
+* ``html_text`` can add newlines (e.g. after headers or paragraphs), so
+  that the output text looks more like how it is rendered in browsers.
 
 Install
 -------
@@ -48,7 +38,7 @@ Install with pip::
 
     pip install html-text
 
-The package depends on lxml, so you might need to install some additional
+The package depends on lxml, so you might need to install additional
 packages: http://lxml.de/installation.html
 
 
@@ -64,16 +54,27 @@ Extract text from HTML::
     >>> html_text.extract_text('<h1>Hello</h1> world!', guess_layout=False)
     'Hello world!'
 
+Passed html is first cleaned from invisible non-text content such
+as styles, and then text is extracted.
 
-
-You can also pass already parsed ``lxml.html.HtmlElement``:
+You can also pass an already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
     >>> tree = html_text.parse_html('<h1>Hello</h1> world!')
     >>> html_text.extract_text(tree)
     'Hello\n\nworld!'
 
-Or define a selector to extract text only from specific elements:
+If you want, you can handle cleaning manually; use lower-level
+``html_text.etree_to_text`` in this case:
+
+    >>> import html_text
+    >>> tree = html_text.parse_html('<h1>Hello<style>.foo{}</style>!</h1>')
+    >>> cleaned_tree = html_text.cleaner.clean_html(tree)
+    >>> html_text.etree_to_text(cleaned_tree)
+    'Hello!'
+
+parsel.Selector objects are also supported; you can define
+a parsel.Selector to extract text only from specific elements:
 
     >>> import html_text
     >>> sel = html_text.cleaned_selector('<h1>Hello</h1> world!')
@@ -81,14 +82,18 @@ Or define a selector to extract text only from specific elements:
     >>> html_text.selector_to_text(subsel)
     'Hello'
 
-Passed html will be first cleaned from invisible non-text content such
-as styles, and then text would be extracted.
-NB Selectors are not cleaned automatically you need to call
+NB: parsel.Selector objects are not cleaned automatically; you need to call
 ``html_text.cleaned_selector`` first.
 
-Main functions:
+Main functions and objects:
 
 * ``html_text.extract_text`` accepts html and returns extracted text.
+* ``html_text.etree_to_text`` accepts parsed lxml Element and returns
+  extracted text; it is a lower-level function, cleaning is not handled
+  here.
+* ``html_text.cleaner`` is an ``lxml.html.clean.Cleaner`` instance which
+  can be used with ``html_text.etree_to_text``; its options are tuned for
+  speed and text extraction quality.
 * ``html_text.cleaned_selector`` accepts html as text or as
   ``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
@@ -111,10 +116,13 @@ after ``<div>`` tags:
     ...                        newline_tags=newline_tags)
     'Hello world!'
 
-Credits
--------
-
-The code is extracted from utilities used in several projects, written by Mikhail Korobov.
+Apart from just getting text from the page (e.g. for display or search),
+one intended usage of this library is for machine learning (feature extraction).
+If you want to use the text of the html page as a feature (e.g. for classification),
+this library gives you plain text that you can later feed into a standard text
+classification pipeline.
+If you feel that you need html structure as well, check out
+`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.
 
 ----
 
diff --git a/html_text/__init__.py b/html_text/__init__.py
index 4045012..21fa95f 100644
--- a/html_text/__init__.py
+++ b/html_text/__init__.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 __version__ = '0.4.1'
 
-from .html_text import (extract_text, parse_html, cleaned_selector,
-                        selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
+from .html_text import (etree_to_text, extract_text, selector_to_text,
+                        parse_html, cleaned_selector, cleaner,
+                        NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 4e69b12..f604872 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -4,8 +4,6 @@
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
-import parsel
-from parsel.selector import create_root_node
 
 
 NEWLINE_TAGS = frozenset([
@@ -18,7 +16,7 @@
     'p', 'pre', 'title', 'ul'
 ])
 
-_clean_html = Cleaner(
+cleaner = Cleaner(
     scripts=True,
     javascript=False,  # onclick attributes are fine
     comments=True,
@@ -33,7 +31,7 @@
     annoying_tags=False,
     remove_unknown_tags=False,
     safe_attrs_only=False,
-).clean_html
+)
 
 
 def _cleaned_html_tree(html):
@@ -41,13 +39,19 @@ def _cleaned_html_tree(html):
         tree = html
     else:
         tree = parse_html(html)
-    return _clean_html(tree)
+    return cleaner.clean_html(tree)
 
 
 def parse_html(html):
     """ Create an lxml.html.HtmlElement from a string with html.
+    XXX: mostly copy-pasted from parsel.selector.create_root_node
     """
-    return create_root_node(html, lxml.html.HTMLParser)
+    body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
+    parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
+    root = lxml.etree.fromstring(body, parser=parser)
+    if root is None:
+        root = lxml.etree.fromstring(b'<html/>', parser=parser)
+    return root
 
 
 _whitespace = re.compile(r'\s+')
@@ -60,15 +64,18 @@ def _normalize_whitespace(text):
     return _whitespace.sub(' ', text.strip())
 
 
-def _html_to_text(tree,
+def etree_to_text(tree,
                   guess_punct_space=True,
                   guess_layout=True,
                   newline_tags=NEWLINE_TAGS,
                   double_newline_tags=DOUBLE_NEWLINE_TAGS):
     """
-    Convert a cleaned html tree to text.
-    See html_text.extract_text docstring for description of the approach
-    and options.
+    Convert a html tree to text. Tree should be cleaned with
+    ``html_text.html_text.cleaner.clean_html`` before passing to this
+    function.
+
+    See html_text.extract_text docstring for description of the
+    approach and options.
     """
     chunks = []
 
@@ -135,11 +142,12 @@ def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
     See html_text.extract_text docstring for description of the approach
     and options.
     """
+    import parsel
     if isinstance(sel, parsel.SelectorList):
         # if selecting a specific xpath
         text = []
         for s in sel:
-            extracted = _html_to_text(
+            extracted = etree_to_text(
                 s.root,
                 guess_punct_space=guess_punct_space,
                 guess_layout=guess_layout)
@@ -147,15 +155,16 @@ def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
                 text.append(extracted)
         return ' '.join(text)
     else:
-        return _html_to_text(
+        return etree_to_text(
             sel.root,
             guess_punct_space=guess_punct_space,
             guess_layout=guess_layout)
 
 
 def cleaned_selector(html):
-    """ Clean selector.
+    """ Clean parsel.selector.
     """
+    import parsel
     try:
         tree = _cleaned_html_tree(html)
         sel = parsel.Selector(root=tree, type='html')
@@ -183,6 +192,9 @@ def extract_text(html,
 
     html should be a unicode string or an already parsed lxml.html element.
 
+    ``html_text.etree_to_text`` is a lower-level function which only accepts
+    an already parsed lxml.html Element, and is not doing html cleaning itself.
+
     When guess_punct_space is True (default), no extra whitespace is added
     for punctuation. This has a slight (around 10%) performance overhead
     and is just a heuristic.
@@ -198,7 +210,7 @@ def extract_text(html,
     if html is None:
         return ''
     cleaned = _cleaned_html_tree(html)
-    return _html_to_text(
+    return etree_to_text(
         cleaned,
         guess_punct_space=guess_punct_space,
         guess_layout=guess_layout,
diff --git a/setup.py b/setup.py
index 462b680..5f445dc 100755
--- a/setup.py
+++ b/setup.py
@@ -9,14 +9,6 @@
 with open('CHANGES.rst') as history_file:
     history = history_file.read()
 
-requirements = [
-    'lxml',
-    'parsel',
-]
-
-test_requirements = [
-    'pytest',
-]
 
 setup(
     name='html_text',
@@ -28,7 +20,7 @@
     url='https://github.com/TeamHG-Memex/html-text',
     packages=['html_text'],
     include_package_data=True,
-    install_requires=requirements,
+    install_requires=['lxml'],
     license="MIT license",
     zip_safe=False,
     classifiers=[
@@ -43,5 +35,5 @@
         'Programming Language :: Python :: 3.6',
     ],
     test_suite='tests',
-    tests_require=test_requirements
+    tests_require=['pytest'],
 )
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index feee55a..b84fdec 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -6,7 +6,8 @@
 import pytest
 
 from html_text import (extract_text, parse_html, cleaned_selector,
-                       selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
+                       etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS,
+                       DOUBLE_NEWLINE_TAGS)
 
 
 ROOT = os.path.dirname(os.path.abspath(__file__))
@@ -184,3 +185,6 @@ def test_webpages(page, extracted):
         html = html.replace('&nbsp;', ' ')
     expected = _load_file(extracted)
     assert extract_text(html) == expected
+
+    tree = cleaner.clean_html(parse_html(html))
+    assert etree_to_text(tree) == expected
diff --git a/tox.ini b/tox.ini
index 762663c..f057de3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,6 +5,7 @@ envlist = py27,py35,py36
 deps =
     pytest
     pytest-cov
+    parsel
 
 commands =
     pip install -U pip

From 2bfcf2cf1b93bf4717cfe131be7cae1604b5db5f Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Wed, 14 Nov 2018 15:36:55 +0000
Subject: [PATCH 2/6] TST add missing test case

this is to cover all branches in parse_html function
---
 tests/test_html_text.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index b84fdec..2c6d6d5 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -49,6 +49,10 @@ def test_empty(all_options):
     assert extract_text(None, **all_options) == ''
 
 
+def test_comment(all_options):
+    assert extract_text(u"<!-- hello world -->", **all_options) == ''
+
+
 def test_extract_text_from_tree(all_options):
     html = (u'<html><style>.div {}</style>'
             '<body><p>Hello,   world!</body></html>')

From ef979dc782d99300f5c70d82f933cdd05071fac4 Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Sat, 17 Nov 2018 15:52:04 +0500
Subject: [PATCH 3/6] DOC make it more explicit selector_to_text is
 parsel-specific

---
 html_text/html_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index f604872..eaaf078 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -138,7 +138,7 @@ def traverse_text_fragments(tree, context, handle_tail=True):
 
 
 def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
-    """ Convert a cleaned selector to text.
+    """ Convert a cleaned parsel.Selector to text.
     See html_text.extract_text docstring for description of the approach
     and options.
     """

From 1ac734980aa1ea6b47623553b5fe4bf835a74c8f Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Sat, 17 Nov 2018 16:10:40 +0500
Subject: [PATCH 4/6] TST run tests without parsel by default, add environments
 with parsel

---
 tests/test_html_text.py | 1 +
 tox.ini                 | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index 2c6d6d5..8f2ffae 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -101,6 +101,7 @@ def test_bad_punct_whitespace():
 
 
 def test_selectors(all_options):
+    pytest.importorskip("parsel")
     html = (u'<span><span id="extract-me">text<a>more</a>'
             '</span>and more text <a> and some more</a> <a></a> </span>')
     # Selector
diff --git a/tox.ini b/tox.ini
index 28bc4f4..1c8fcb9 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,11 +1,11 @@
 [tox]
-envlist = py27,py35,py36,py37
+envlist = py27,py35,py36,py37,{py2,py3}-parsel
 
 [testenv]
 deps =
     pytest
     pytest-cov
-    parsel
+    {py2,py3}-parsel: parsel
 
 commands =
     pip install -U pip

From 028d2e1bb00f633470a8a559382a654944e3fcc9 Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Sat, 17 Nov 2018 16:18:53 +0500
Subject: [PATCH 5/6] TST enable parsel-specific tests on Travis

---
 .travis.yml | 4 ++++
 tox.ini     | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d30217e..d2467c3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,10 +9,14 @@ matrix:
   include:
     - python: 2.7
       env: TOXENV=py27
+    - python: 2.7
+      env: TOXENV=py27-parsel
     - python: 3.5
       env: TOXENV=py35
     - python: 3.6
       env: TOXENV=py36
+    - python: 3.6
+      env: TOXENV=py36-parsel
     - python: 3.7
       env: TOXENV=py37
       dist: xenial
diff --git a/tox.ini b/tox.ini
index 1c8fcb9..33ebfef 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,11 +1,11 @@
 [tox]
-envlist = py27,py35,py36,py37,{py2,py3}-parsel
+envlist = py27,py35,py36,py37,{py27,py36}-parsel
 
 [testenv]
 deps =
     pytest
     pytest-cov
-    {py2,py3}-parsel: parsel
+    {py27,py36}-parsel: parsel
 
 commands =
     pip install -U pip

From d8fa17a4831cbd43ac1d619a087e26f7e6c14f5a Mon Sep 17 00:00:00 2001
From: Mikhail Korobov <kmike84@gmail.com>
Date: Sat, 17 Nov 2018 16:24:08 +0500
Subject: [PATCH 6/6] disable coverage "project" check

---
 codecov.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 codecov.yml

diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 0000000..d8aa6b9
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,6 @@
+comment:
+  layout: "header, diff, tree"
+
+coverage:
+  status:
+    project: false