From 93aea7a22280f03ab1140827f72f9d4a57d17b93 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Mon, 12 Nov 2018 16:36:23 +0000 Subject: [PATCH 1/6] remove parsel dependency * _html_to_text is promoted to a public html_text.etree_to_text * html_text.cleaner object is exposed * parsel is imported only when needed * create_root_node implementation is copy-pasted to parse_html, to remove dependency * parsel is removed from install_requiers * README is updated The goal is to allow using html_text in parsel. --- README.rst | 68 +++++++++++++++++++++++------------------ html_text/__init__.py | 5 +-- html_text/html_text.py | 40 +++++++++++++++--------- setup.py | 12 ++------ tests/test_html_text.py | 6 +++- tox.ini | 1 + 6 files changed, 75 insertions(+), 57 deletions(-) diff --git a/README.rst b/README.rst index 5d3fdb9..b29bb97 100644 --- a/README.rst +++ b/README.rst @@ -17,29 +17,19 @@ HTML to Text Extract text from HTML - * Free software: MIT license - How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? -Text extracted with ``html_text`` does not contain inline styles, -javascript, comments and other text that is not normally visible to the users. -It normalizes whitespace, but is also smarter than -``.xpath('normalize-space())``, adding spaces around inline elements -(which are often used as block elements in html markup), -tries to avoid adding extra spaces for punctuation and -can add newlines so that the output text looks like how it is rendered in -browsers. - -Apart from just getting text from the page (e.g. for display or search), -one intended usage of this library is for machine learning (feature extraction). -If you want to use the text of the html page as a feature (e.g. for classification), -this library gives you plain text that you can later feed into a standard text -classification pipeline. -If you feel that you need html structure as well, check out -`webstruct `_ library. 
+* Text extracted with ``html_text`` does not contain inline styles, + javascript, comments and other text that is not normally visible to users; +* ``html_text`` normalizes whitespace, but in a way smarter than + ``.xpath('normalize-space()')``, adding spaces around inline elements + (which are often used as block elements in html markup), and trying to + avoid adding extra spaces for punctuation; +* ``html_text`` can add newlines (e.g. after headers or paragraphs), so + that the output text looks more like how it is rendered in browsers. Install ------- @@ -48,7 +38,7 @@ Install with pip:: pip install html-text -The package depends on lxml, so you might need to install some additional +The package depends on lxml, so you might need to install additional packages: http://lxml.de/installation.html @@ -64,16 +54,27 @@ Extract text from HTML:: >>> html_text.extract_text('

Hello

world!', guess_layout=False) 'Hello world!' +Passed html is first cleaned from invisible non-text content such +as styles, and then text is extracted. - -You can also pass already parsed ``lxml.html.HtmlElement``: +You can also pass an already parsed ``lxml.html.HtmlElement``: >>> import html_text >>> tree = html_text.parse_html('

Hello

world!') >>> html_text.extract_text(tree) 'Hello\n\nworld!' -Or define a selector to extract text only from specific elements: +If you want, you can handle cleaning manually; use lower-level +``html_text.etree_to_text`` in this case: + + >>> import html_text + >>> tree = html_text.parse_html('

Hello!

') + >>> cleaned_tree = html_text.cleaner.clean_html(tree) + >>> html_text.etree_to_text(cleaned_tree) + 'Hello!' + +parsel.Selector objects are also supported; you can define +a parsel.Selector to extract text only from specific elements: >>> import html_text >>> sel = html_text.cleaned_selector('

Hello

world!') @@ -81,14 +82,18 @@ Or define a selector to extract text only from specific elements: >>> html_text.selector_to_text(subsel) 'Hello' -Passed html will be first cleaned from invisible non-text content such -as styles, and then text would be extracted. -NB Selectors are not cleaned automatically you need to call +NB parsel.Selector objects are not cleaned automatically, you need to call ``html_text.cleaned_selector`` first. -Main functions: +Main functions and objects: * ``html_text.extract_text`` accepts html and returns extracted text. +* ``html_text.etree_to_text`` accepts parsed lxml Element and returns + extracted text; it is a lower-level function, cleaning is not handled + here. +* ``html_text.cleaner`` is an ``lxml.html.clean.Cleaner`` instance which + can be used with ``html_text.etree_to_text``; its options are tuned for + speed and text extraction quality. * ``html_text.cleaned_selector`` accepts html as text or as ``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``. * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns @@ -111,10 +116,13 @@ after ``
`` tags: ... newline_tags=newline_tags) 'Hello world!' -Credits -------- - -The code is extracted from utilities used in several projects, written by Mikhail Korobov. +Apart from just getting text from the page (e.g. for display or search), +one intended usage of this library is for machine learning (feature extraction). +If you want to use the text of the html page as a feature (e.g. for classification), +this library gives you plain text that you can later feed into a standard text +classification pipeline. +If you feel that you need html structure as well, check out +`webstruct `_ library. ---- diff --git a/html_text/__init__.py b/html_text/__init__.py index 4045012..21fa95f 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- __version__ = '0.4.1' -from .html_text import (extract_text, parse_html, cleaned_selector, - selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS) +from .html_text import (etree_to_text, extract_text, selector_to_text, + parse_html, cleaned_selector, cleaner, + NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS) diff --git a/html_text/html_text.py b/html_text/html_text.py index 4e69b12..f604872 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -4,8 +4,6 @@ import lxml import lxml.etree from lxml.html.clean import Cleaner -import parsel -from parsel.selector import create_root_node NEWLINE_TAGS = frozenset([ @@ -18,7 +16,7 @@ 'p', 'pre', 'title', 'ul' ]) -_clean_html = Cleaner( +cleaner = Cleaner( scripts=True, javascript=False, # onclick attributes are fine comments=True, @@ -33,7 +31,7 @@ annoying_tags=False, remove_unknown_tags=False, safe_attrs_only=False, -).clean_html +) def _cleaned_html_tree(html): @@ -41,13 +39,19 @@ def _cleaned_html_tree(html): tree = html else: tree = parse_html(html) - return _clean_html(tree) + return cleaner.clean_html(tree) def parse_html(html): """ Create an lxml.html.HtmlElement from a string with html. 
+ XXX: mostly copy-pasted from parsel.selector.create_root_node """ - return create_root_node(html, lxml.html.HTMLParser) + body = html.strip().replace('\x00', '').encode('utf8') or b'' + parser = lxml.html.HTMLParser(recover=True, encoding='utf8') + root = lxml.etree.fromstring(body, parser=parser) + if root is None: + root = lxml.etree.fromstring(b'', parser=parser) + return root _whitespace = re.compile(r'\s+') @@ -60,15 +64,18 @@ def _normalize_whitespace(text): return _whitespace.sub(' ', text.strip()) -def _html_to_text(tree, +def etree_to_text(tree, guess_punct_space=True, guess_layout=True, newline_tags=NEWLINE_TAGS, double_newline_tags=DOUBLE_NEWLINE_TAGS): """ - Convert a cleaned html tree to text. - See html_text.extract_text docstring for description of the approach - and options. + Convert a html tree to text. Tree should be cleaned with + ``html_text.html_text.cleaner.clean_html`` before passing to this + function. + + See html_text.extract_text docstring for description of the + approach and options. """ chunks = [] @@ -135,11 +142,12 @@ def selector_to_text(sel, guess_punct_space=True, guess_layout=True): See html_text.extract_text docstring for description of the approach and options. """ + import parsel if isinstance(sel, parsel.SelectorList): # if selecting a specific xpath text = [] for s in sel: - extracted = _html_to_text( + extracted = etree_to_text( s.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout) @@ -147,15 +155,16 @@ def selector_to_text(sel, guess_punct_space=True, guess_layout=True): text.append(extracted) return ' '.join(text) else: - return _html_to_text( + return etree_to_text( sel.root, guess_punct_space=guess_punct_space, guess_layout=guess_layout) def cleaned_selector(html): - """ Clean selector. + """ Clean parsel.selector. 
""" + import parsel try: tree = _cleaned_html_tree(html) sel = parsel.Selector(root=tree, type='html') @@ -183,6 +192,9 @@ def extract_text(html, html should be a unicode string or an already parsed lxml.html element. + ``html_text.etree_to_text`` is a lower-level function which only accepts + an already parsed lxml.html Element, and is not doing html cleaning itself. + When guess_punct_space is True (default), no extra whitespace is added for punctuation. This has a slight (around 10%) performance overhead and is just a heuristic. @@ -198,7 +210,7 @@ def extract_text(html, if html is None: return '' cleaned = _cleaned_html_tree(html) - return _html_to_text( + return etree_to_text( cleaned, guess_punct_space=guess_punct_space, guess_layout=guess_layout, diff --git a/setup.py b/setup.py index 462b680..5f445dc 100755 --- a/setup.py +++ b/setup.py @@ -9,14 +9,6 @@ with open('CHANGES.rst') as history_file: history = history_file.read() -requirements = [ - 'lxml', - 'parsel', -] - -test_requirements = [ - 'pytest', -] setup( name='html_text', @@ -28,7 +20,7 @@ url='https://github.com/TeamHG-Memex/html-text', packages=['html_text'], include_package_data=True, - install_requires=requirements, + install_requires=['lxml'], license="MIT license", zip_safe=False, classifiers=[ @@ -43,5 +35,5 @@ 'Programming Language :: Python :: 3.6', ], test_suite='tests', - tests_require=test_requirements + tests_require=['pytest'], ) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index feee55a..b84fdec 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -6,7 +6,8 @@ import pytest from html_text import (extract_text, parse_html, cleaned_selector, - selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS) + etree_to_text, cleaner, selector_to_text, NEWLINE_TAGS, + DOUBLE_NEWLINE_TAGS) ROOT = os.path.dirname(os.path.abspath(__file__)) @@ -184,3 +185,6 @@ def test_webpages(page, extracted): html = html.replace(' ', ' ') expected = _load_file(extracted) assert 
extract_text(html) == expected + + tree = cleaner.clean_html(parse_html(html)) + assert etree_to_text(tree) == expected diff --git a/tox.ini b/tox.ini index 762663c..f057de3 100644 --- a/tox.ini +++ b/tox.ini @@ -5,6 +5,7 @@ envlist = py27,py35,py36 deps = pytest pytest-cov + parsel commands = pip install -U pip From 2bfcf2cf1b93bf4717cfe131be7cae1604b5db5f Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 14 Nov 2018 15:36:55 +0000 Subject: [PATCH 2/6] TST add missing test case this is to cover all branches in parse_html function --- tests/test_html_text.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index b84fdec..2c6d6d5 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -49,6 +49,10 @@ def test_empty(all_options): assert extract_text(None, **all_options) == '' +def test_comment(all_options): + assert extract_text(u"", **all_options) == '' + + def test_extract_text_from_tree(all_options): html = (u'' '

Hello, world!') From ef979dc782d99300f5c70d82f933cdd05071fac4 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sat, 17 Nov 2018 15:52:04 +0500 Subject: [PATCH 3/6] DOC make it more explicit selector_to_text is parsel-specific --- html_text/html_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index f604872..eaaf078 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -138,7 +138,7 @@ def traverse_text_fragments(tree, context, handle_tail=True): def selector_to_text(sel, guess_punct_space=True, guess_layout=True): - """ Convert a cleaned selector to text. + """ Convert a cleaned parsel.Selector to text. See html_text.extract_text docstring for description of the approach and options. """ From 1ac734980aa1ea6b47623553b5fe4bf835a74c8f Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sat, 17 Nov 2018 16:10:40 +0500 Subject: [PATCH 4/6] TST run tests without parsel by default, add environments with parsel --- tests/test_html_text.py | 1 + tox.ini | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 2c6d6d5..8f2ffae 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -101,6 +101,7 @@ def test_bad_punct_whitespace(): def test_selectors(all_options): + pytest.importorskip("parsel") html = (u'textmore' 'and more text and some more ') # Selector diff --git a/tox.ini b/tox.ini index 28bc4f4..1c8fcb9 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,11 @@ [tox] -envlist = py27,py35,py36,py37 +envlist = py27,py35,py36,py37,{py2,py3}-parsel [testenv] deps = pytest pytest-cov - parsel + {py2,py3}-parsel: parsel commands = pip install -U pip From 028d2e1bb00f633470a8a559382a654944e3fcc9 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sat, 17 Nov 2018 16:18:53 +0500 Subject: [PATCH 5/6] TST enable parsel-specific tests on Travis --- .travis.yml | 4 ++++ tox.ini | 4 ++-- 2 files changed, 6 
insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d30217e..d2467c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,10 +9,14 @@ matrix: include: - python: 2.7 env: TOXENV=py27 + - python: 2.7 + env: TOXENV=py27-parsel - python: 3.5 env: TOXENV=py35 - python: 3.6 env: TOXENV=py36 + - python: 3.6 + env: TOXENV=py36-parsel - python: 3.7 env: TOXENV=py37 dist: xenial diff --git a/tox.ini b/tox.ini index 1c8fcb9..33ebfef 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,11 @@ [tox] -envlist = py27,py35,py36,py37,{py2,py3}-parsel +envlist = py27,py35,py36,py37,{py27,py36}-parsel [testenv] deps = pytest pytest-cov - {py2,py3}-parsel: parsel + {py27,py36}-parsel: parsel commands = pip install -U pip From d8fa17a4831cbd43ac1d619a087e26f7e6c14f5a Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Sat, 17 Nov 2018 16:24:08 +0500 Subject: [PATCH 6/6] disable coverage "project" check --- codecov.yml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 codecov.yml diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..d8aa6b9 --- /dev/null +++ b/codecov.yml @@ -0,0 +1,6 @@ +comment: + layout: "header, diff, tree" + +coverage: + status: + project: false