diff --git a/README.rst b/README.rst
index 5d3fdb9..b29bb97 100644
--- a/README.rst
+++ b/README.rst
@@ -17,29 +17,19 @@ HTML to Text
Extract text from HTML
-
* Free software: MIT license
-
How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
-Text extracted with ``html_text`` does not contain inline styles,
-javascript, comments and other text that is not normally visible to the users.
-It normalizes whitespace, but is also smarter than
-``.xpath('normalize-space())``, adding spaces around inline elements
-(which are often used as block elements in html markup),
-tries to avoid adding extra spaces for punctuation and
-can add newlines so that the output text looks like how it is rendered in
-browsers.
-
-Apart from just getting text from the page (e.g. for display or search),
-one intended usage of this library is for machine learning (feature extraction).
-If you want to use the text of the html page as a feature (e.g. for classification),
-this library gives you plain text that you can later feed into a standard text
-classification pipeline.
-If you feel that you need html structure as well, check out
-`webstruct <https://github.com/scrapinghub/webstruct>`_ library.
+* Text extracted with ``html_text`` does not contain inline styles,
+ javascript, comments and other text that is not normally visible to users;
+* ``html_text`` normalizes whitespace, but in a way smarter than
+ ``.xpath('normalize-space()')``, adding spaces around inline elements
+ (which are often used as block elements in html markup), and trying to
+ avoid adding extra spaces for punctuation;
+* ``html_text`` can add newlines (e.g. after headers or paragraphs), so
+ that the output text looks more like how it is rendered in browsers.
Install
-------
@@ -48,7 +38,7 @@ Install with pip::
pip install html-text
-The package depends on lxml, so you might need to install some additional
+The package depends on lxml, so you might need to install additional
packages: http://lxml.de/installation.html
@@ -64,16 +54,27 @@ Extract text from HTML::
>>> html_text.extract_text('
Hello
world!', guess_layout=False)
'Hello world!'
+Passed html is first cleaned from invisible non-text content such
+as styles, and then text is extracted.
-
-You can also pass already parsed ``lxml.html.HtmlElement``:
+You can also pass an already parsed ``lxml.html.HtmlElement``:
>>> import html_text
>>> tree = html_text.parse_html('Hello
world!')
>>> html_text.extract_text(tree)
'Hello\n\nworld!'
-Or define a selector to extract text only from specific elements:
+If you want, you can handle cleaning manually; use lower-level
+``html_text.etree_to_text`` in this case:
+
+ >>> import html_text
+ >>> tree = html_text.parse_html('Hello!
')
+ >>> cleaned_tree = html_text.cleaner.clean_html(tree)
+ >>> html_text.etree_to_text(cleaned_tree)
+ 'Hello!'
+
+parsel.Selector objects are also supported; you can define
+a parsel.Selector to extract text only from specific elements:
>>> import html_text
>>> sel = html_text.cleaned_selector('Hello
world!')
@@ -81,14 +82,18 @@ Or define a selector to extract text only from specific elements:
>>> html_text.selector_to_text(subsel)
'Hello'
-Passed html will be first cleaned from invisible non-text content such
-as styles, and then text would be extracted.
-NB Selectors are not cleaned automatically you need to call
+NB parsel.Selector objects are not cleaned automatically, you need to call
``html_text.cleaned_selector`` first.
-Main functions:
+Main functions and objects:
* ``html_text.extract_text`` accepts html and returns extracted text.
+* ``html_text.etree_to_text`` accepts parsed lxml Element and returns
+ extracted text; it is a lower-level function, cleaning is not handled
+ here.
+* ``html_text.cleaner`` is an ``lxml.html.clean.Cleaner`` instance which
+ can be used with ``html_text.etree_to_text``; its options are tuned for
+ speed and text extraction quality.
* ``html_text.cleaned_selector`` accepts html as text or as
``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
* ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
@@ -111,10 +116,13 @@ after ```` tags:
... newline_tags=newline_tags)
'Hello world!'
-Credits
--------
-
-The code is extracted from utilities used in several projects, written by Mikhail Korobov.
+Apart from just getting text from the page (e.g. for display or search),
+one intended usage of this library is for machine learning (feature extraction).
+If you want to use the text of the html page as a feature (e.g. for classification),
+this library gives you plain text that you can later feed into a standard text
+classification pipeline.
+If you feel that you need html structure as well, check out
+`webstruct <https://github.com/scrapinghub/webstruct>`_ library.
----
diff --git a/html_text/__init__.py b/html_text/__init__.py
index 4045012..21fa95f 100644
--- a/html_text/__init__.py
+++ b/html_text/__init__.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
__version__ = '0.4.1'
-from .html_text import (extract_text, parse_html, cleaned_selector,
- selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
+from .html_text import (etree_to_text, extract_text, selector_to_text,
+ parse_html, cleaned_selector, cleaner,
+ NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/html_text/html_text.py b/html_text/html_text.py
index 4e69b12..f604872 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -4,8 +4,6 @@
import lxml
import lxml.etree
from lxml.html.clean import Cleaner
-import parsel
-from parsel.selector import create_root_node
NEWLINE_TAGS = frozenset([
@@ -18,7 +16,7 @@
'p', 'pre', 'title', 'ul'
])
-_clean_html = Cleaner(
+cleaner = Cleaner(
scripts=True,
javascript=False, # onclick attributes are fine
comments=True,
@@ -33,7 +31,7 @@
annoying_tags=False,
remove_unknown_tags=False,
safe_attrs_only=False,
-).clean_html
+)
def _cleaned_html_tree(html):
@@ -41,13 +39,19 @@ def _cleaned_html_tree(html):
tree = html
else:
tree = parse_html(html)
- return _clean_html(tree)
+ return cleaner.clean_html(tree)
def parse_html(html):
""" Create an lxml.html.HtmlElement from a string with html.
+ XXX: mostly copy-pasted from parsel.selector.create_root_node
"""
- return create_root_node(html, lxml.html.HTMLParser)
+    body = html.strip().replace('\x00', '').encode('utf8') or b'<html/>'
+ parser = lxml.html.HTMLParser(recover=True, encoding='utf8')
+ root = lxml.etree.fromstring(body, parser=parser)
+ if root is None:
+ root = lxml.etree.fromstring(b'