Merge pull request #15 from TeamHG-Memex/remove-parsel-dependency
Remove parsel dependency
lopuhin committed Nov 19, 2018
2 parents b5cd26a + d8fa17a commit 80289f1
Showing 8 changed files with 92 additions and 59 deletions.
4 changes: 4 additions & 0 deletions .travis.yml
@@ -9,10 +9,14 @@ matrix:
include:
- python: 2.7
env: TOXENV=py27
- python: 2.7
env: TOXENV=py27-parsel
- python: 3.5
env: TOXENV=py35
- python: 3.6
env: TOXENV=py36
- python: 3.6
env: TOXENV=py36-parsel
- python: 3.7
env: TOXENV=py37
dist: xenial
68 changes: 38 additions & 30 deletions README.rst
@@ -17,29 +17,19 @@ HTML to Text

Extract text from HTML


* Free software: MIT license


How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to users.
It normalizes whitespace, but is also smarter than
``.xpath('normalize-space()')``: it adds spaces around inline elements
(which are often used as block elements in html markup),
tries to avoid adding extra spaces for punctuation, and
can add newlines so that the output text looks like how it is rendered in
browsers.

Apart from just getting text from the page (e.g. for display or search),
one intended usage of this library is for machine learning (feature extraction).
If you want to use the text of the html page as a feature (e.g. for classification),
this library gives you plain text that you can later feed into a standard text
classification pipeline.
If you feel that you need html structure as well, check out
`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.

* Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to users;
* ``html_text`` normalizes whitespace, but in a smarter way than
``.xpath('normalize-space()')``, adding spaces around inline elements
(which are often used as block elements in html markup), and trying to
avoid adding extra spaces for punctuation;
* ``html-text`` can add newlines (e.g. after headers or paragraphs), so
that the output text looks more like how it is rendered in browsers.
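The behaviour in these bullets can be illustrated with a deliberately tiny, stdlib-only sketch. This is not html_text's implementation (which is built on lxml and handles many more cases), just the general idea: skip invisible content, collapse whitespace, add newlines after block tags.

```python
from html.parser import HTMLParser

# Toy illustration only -- NOT html_text's real algorithm.
BLOCK_TAGS = {'p', 'div', 'li', 'h1', 'h2', 'h3'}  # add a newline after these
SKIP_TAGS = {'script', 'style'}                     # invisible content

class _ToyExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.chunks = []
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        if tag in SKIP_TAGS:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        if tag in SKIP_TAGS and self._skip_depth:
            self._skip_depth -= 1
        elif tag in BLOCK_TAGS:
            self.chunks.append('\n')

    def handle_data(self, data):
        if not self._skip_depth and data.strip():
            # Collapse runs of whitespace to single spaces.
            self.chunks.append(' '.join(data.split()))

def toy_extract(html):
    parser = _ToyExtractor()
    parser.feed(html)
    text = ' '.join(parser.chunks)
    # Tidy the spaces that surround inserted newlines.
    return text.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n').strip()

toy_extract('<h1>Hello</h1> world!')     # 'Hello\nworld!'
toy_extract('<style>.foo{}</style>Hi!')  # 'Hi!'
```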

Install
-------
@@ -48,7 +38,7 @@ Install with pip::

pip install html-text

The package depends on lxml, so you might need to install some additional
The package depends on lxml, so you might need to install additional
packages: http://lxml.de/installation.html


@@ -64,31 +54,46 @@ Extract text from HTML::
>>> html_text.extract_text('<h1>Hello</h1> world!', guess_layout=False)
'Hello world!'

Passed html is first cleaned from invisible non-text content such
as styles, and then text is extracted.


You can also pass already parsed ``lxml.html.HtmlElement``:
You can also pass an already parsed ``lxml.html.HtmlElement``:

>>> import html_text
>>> tree = html_text.parse_html('<h1>Hello</h1> world!')
>>> html_text.extract_text(tree)
'Hello\n\nworld!'

Or define a selector to extract text only from specific elements:
If you want, you can handle cleaning manually; use the lower-level
``html_text.etree_to_text`` in this case:

>>> import html_text
>>> tree = html_text.parse_html('<h1>Hello<style>.foo{}</style>!</h1>')
>>> cleaned_tree = html_text.cleaner.clean_html(tree)
>>> html_text.etree_to_text(cleaned_tree)
'Hello!'

parsel.Selector objects are also supported; you can define
a parsel.Selector to extract text only from specific elements:

>>> import html_text
>>> sel = html_text.cleaned_selector('<h1>Hello</h1> world!')
>>> subsel = sel.xpath('//h1')
>>> html_text.selector_to_text(subsel)
'Hello'

Passed html will first be cleaned of invisible non-text content such
as styles, and then text will be extracted.
NB Selectors are not cleaned automatically, you need to call
NB parsel.Selector objects are not cleaned automatically, you need to call
``html_text.cleaned_selector`` first.

Main functions:
Main functions and objects:

* ``html_text.extract_text`` accepts html and returns extracted text.
* ``html_text.etree_to_text`` accepts a parsed lxml Element and returns
  extracted text; it is a lower-level function that does not handle cleaning.
* ``html_text.cleaner`` is an ``lxml.html.clean.Cleaner`` instance which
can be used with ``html_text.etree_to_text``; its options are tuned for
speed and text extraction quality.
* ``html_text.cleaned_selector`` accepts html as text or as
``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
* ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
@@ -111,10 +116,13 @@ after ``<div>`` tags:
... newline_tags=newline_tags)
'Hello world!'

Credits
-------

The code is extracted from utilities used in several projects, written by Mikhail Korobov.
Apart from just getting text from the page (e.g. for display or search),
one intended usage of this library is for machine learning (feature extraction).
If you want to use the text of the html page as a feature (e.g. for classification),
this library gives you plain text that you can later feed into a standard text
classification pipeline.
If you feel that you need html structure as well, check out
`webstruct <http://webstruct.readthedocs.io/en/latest/>`_ library.

----

6 changes: 6 additions & 0 deletions codecov.yml
@@ -0,0 +1,6 @@
comment:
layout: "header, diff, tree"

coverage:
status:
project: false
5 changes: 3 additions & 2 deletions html_text/__init__.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
__version__ = '0.4.1'

from .html_text import (extract_text, parse_html, cleaned_selector,
selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
from .html_text import (etree_to_text, extract_text, selector_to_text,
parse_html, cleaned_selector, cleaner,
NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
42 changes: 27 additions & 15 deletions html_text/html_text.py
@@ -4,8 +4,6 @@
import lxml
import lxml.etree
from lxml.html.clean import Cleaner
import parsel
from parsel.selector import create_root_node


NEWLINE_TAGS = frozenset([
@@ -18,7 +16,7 @@
'p', 'pre', 'title', 'ul'
])

_clean_html = Cleaner(
cleaner = Cleaner(
scripts=True,
javascript=False, # onclick attributes are fine
comments=True,
@@ -33,21 +31,27 @@
annoying_tags=False,
remove_unknown_tags=False,
safe_attrs_only=False,
).clean_html
)


def _cleaned_html_tree(html):
if isinstance(html, lxml.html.HtmlElement):
tree = html
else:
tree = parse_html(html)
return _clean_html(tree)
return cleaner.clean_html(tree)


def parse_html(html):
""" Create an lxml.html.HtmlElement from a string with html.
XXX: mostly copy-pasted from parsel.selector.create_root_node
"""
return create_root_node(html, lxml.html.HTMLParser)
body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
root = lxml.etree.fromstring(body, parser=parser)
if root is None:
root = lxml.etree.fromstring(b'<html/>', parser=parser)
return root
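The replacement parser in this hunk is self-contained; copied out of the diff, it can be exercised directly (requires lxml):

```python
import lxml.etree
import lxml.html

def parse_html(html):
    """Create an lxml HtmlElement from a string with html
    (same logic as the function added in this commit)."""
    body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
    parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
    root = lxml.etree.fromstring(body, parser=parser)
    if root is None:
        # recover=True can still produce None for pathological input.
        root = lxml.etree.fromstring(b'<html/>', parser=parser)
    return root

parse_html('').tag        # 'html' -- empty input falls back to <html/>
parse_html('<h1>Hi').tag  # 'html' -- broken markup is recovered
```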


_whitespace = re.compile(r'\s+')
@@ -60,15 +64,18 @@ def _normalize_whitespace(text):
return _whitespace.sub(' ', text.strip())


def _html_to_text(tree,
def etree_to_text(tree,
guess_punct_space=True,
guess_layout=True,
newline_tags=NEWLINE_TAGS,
double_newline_tags=DOUBLE_NEWLINE_TAGS):
"""
Convert a cleaned html tree to text.
See html_text.extract_text docstring for description of the approach
and options.
Convert a html tree to text. Tree should be cleaned with
``html_text.html_text.cleaner.clean_html`` before passing to this
function.
See html_text.extract_text docstring for description of the
approach and options.
"""
chunks = []

@@ -131,31 +138,33 @@ def traverse_text_fragments(tree, context, handle_tail=True):


def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
""" Convert a cleaned selector to text.
""" Convert a cleaned parsel.Selector to text.
See html_text.extract_text docstring for description of the approach
and options.
"""
import parsel
if isinstance(sel, parsel.SelectorList):
# if selecting a specific xpath
text = []
for s in sel:
extracted = _html_to_text(
extracted = etree_to_text(
s.root,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout)
if extracted:
text.append(extracted)
return ' '.join(text)
else:
return _html_to_text(
return etree_to_text(
sel.root,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout)


def cleaned_selector(html):
""" Clean selector.
""" Clean parsel.Selector.
"""
import parsel
try:
tree = _cleaned_html_tree(html)
sel = parsel.Selector(root=tree, type='html')
@@ -183,6 +192,9 @@ def extract_text(html,
html should be a unicode string or an already parsed lxml.html element.
``html_text.etree_to_text`` is a lower-level function which only accepts
an already parsed lxml.html Element, and does not do html cleaning itself.
When guess_punct_space is True (default), no extra whitespace is added
for punctuation. This has a slight (around 10%) performance overhead
and is just a heuristic.
@@ -198,7 +210,7 @@
if html is None:
return ''
cleaned = _cleaned_html_tree(html)
return _html_to_text(
return etree_to_text(
cleaned,
guess_punct_space=guess_punct_space,
guess_layout=guess_layout,
12 changes: 2 additions & 10 deletions setup.py
@@ -9,14 +9,6 @@
with open('CHANGES.rst') as history_file:
history = history_file.read()

requirements = [
'lxml',
'parsel',
]

test_requirements = [
'pytest',
]

setup(
name='html_text',
@@ -28,7 +20,7 @@
url='https://github.com/TeamHG-Memex/html-text',
packages=['html_text'],
include_package_data=True,
install_requires=requirements,
install_requires=['lxml'],
license="MIT license",
zip_safe=False,
classifiers=[
@@ -44,5 +36,5 @@
'Programming Language :: Python :: 3.7',
],
test_suite='tests',
tests_require=test_requirements
tests_require=['pytest'],
)
11 changes: 10 additions & 1 deletion tests/test_html_text.py
@@ -6,7 +6,8 @@
import pytest

from html_text import (extract_text, parse_html, cleaned_selector,
selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS,
DOUBLE_NEWLINE_TAGS)


ROOT = os.path.dirname(os.path.abspath(__file__))
@@ -48,6 +49,10 @@ def test_empty(all_options):
assert extract_text(None, **all_options) == ''


def test_comment(all_options):
assert extract_text(u"<!-- hello world -->", **all_options) == ''


def test_extract_text_from_tree(all_options):
html = (u'<html><style>.div {}</style>'
'<body><p>Hello, world!</body></html>')
@@ -96,6 +101,7 @@ def test_bad_punct_whitespace():


def test_selectors(all_options):
pytest.importorskip("parsel")
html = (u'<span><span id="extract-me">text<a>more</a>'
'</span>and more text <a> and some more</a> <a></a> </span>')
# Selector
@@ -184,3 +190,6 @@ def test_webpages(page, extracted):
html = html.replace('&nbsp;', ' ')
expected = _load_file(extracted)
assert extract_text(html) == expected

tree = cleaner.clean_html(parse_html(html))
assert etree_to_text(tree) == expected
3 changes: 2 additions & 1 deletion tox.ini
@@ -1,10 +1,11 @@
[tox]
envlist = py27,py35,py36,py37
envlist = py27,py35,py36,py37,{py27,py36}-parsel

[testenv]
deps =
pytest
pytest-cov
{py27,py36}-parsel: parsel

commands =
pip install -U pip
