Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix unwanted joins for inline tags #2

Merged
merged 6 commits into from
May 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to the users.
It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
adding spaces around inline elements too
(which are often used as block elements in html markup),
and tries to avoid adding extra spaces for punctuation.


Install
Expand Down
41 changes: 37 additions & 4 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
import re

import lxml
import lxml.etree
from lxml.html.clean import Cleaner
Expand Down Expand Up @@ -39,10 +41,33 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)


def selector_to_text(sel):
# Collapses any run of whitespace into a single space when normalizing the result.
_whitespace = re.compile(r'\s+')
# True when a fragment already ends in whitespace, so no separator is needed.
_has_trailing_whitespace = re.compile(r'\s$').search
# True when the next fragment begins with closing punctuation -- a space
# before it would look wrong (e.g. "word ,").
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
# True when the previous fragment ends with an opening parenthesis -- a space
# after it would look wrong (e.g. "( word").
_has_punct_before = re.compile(r'\($').search


def selector_to_text(sel, guess_punct_space=True):
""" Convert a cleaned selector to text.
See html_text.extract_text docstring for description of the approach and options.
"""
return sel.xpath('normalize-space()').extract_first('')
if guess_punct_space:

def fragments():
prev = None
for text in sel.xpath('//text()').extract():

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend using './/text()' so that it can be used for any selector, and not only those coming from extract_text(html)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great idea, thanks @redapple - I'd like to also make it possible to pass selectors via the public interface.

if prev is not None and (_has_trailing_whitespace(prev)
or (not _has_punct_after(text) and
not _has_punct_before(prev))):
yield ' '
yield text
prev = text

return _whitespace.sub(' ', ''.join(fragments()).strip())

else:
fragments = (x.strip() for x in sel.xpath('//text()').extract())
return _whitespace.sub(' ', ' '.join(x for x in fragments if x))


def cleaned_selector(html):
Expand All @@ -60,10 +85,18 @@ def cleaned_selector(html):
return sel


def extract_text(html, encoding='utf8'):
def extract_text(html, guess_punct_space=True):
"""
Convert html to text.
Almost the same as normalize-space xpath, but this also
adds spaces between inline elements (like <span>) which are
often used as block elements in html markup.

When guess_punct_space is True (default), no extra whitespace is added
for punctuation. This has a slight (around 10%) performance overhead
and is just a heuristic.

html should be a unicode string or an already parsed lxml.html element.
"""
return selector_to_text(cleaned_selector(html))
sel = cleaned_selector(html)
return selector_to_text(sel, guess_punct_space=guess_punct_space)
40 changes: 32 additions & 8 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,49 @@
# -*- coding: utf-8 -*-
import pytest

from html_text import extract_text, parse_html


def test_extract_text():
@pytest.fixture(params=[{'guess_punct_space': flag}
                        for flag in (True, False)])
def all_options(request):
    """Run a test once per ``guess_punct_space`` setting.

    Yields a kwargs dict suitable for ``extract_text(html, **all_options)``.
    """
    return request.param


def test_extract_text(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_declared_encoding():
def test_declared_encoding(all_options):
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
u'<html><style>.div {}</style>'
u'<body>Hello, world!</p></body></html>')
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_empty():
assert extract_text(u'') == ''
def test_empty(all_options):
assert extract_text(u'', **all_options) == ''


def test_extract_text_from_tree():
def test_extract_text_from_tree(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
tree = parse_html(html)
assert extract_text(tree) == u'Hello, world!'
assert extract_text(tree, **all_options) == u'Hello, world!'


def test_inline_tags_whitespace(all_options):
    """Adjacent inline tags are separated by one space; empty tags add none."""
    markup = u'<span>field</span><span>value of</span><span></span>'
    result = extract_text(markup, **all_options)
    assert result == u'field value of'


def test_punct_whitespace():
    """Without the punctuation heuristic, a space is still inserted before ','."""
    markup = u'<div><span>field</span>, and more</div>'
    result = extract_text(markup, guess_punct_space=False)
    assert result == u'field , and more'


def test_punct_whitespace_preserved():
    """The punctuation heuristic must keep spaces that exist in the source text."""
    markup = (u'<div><span>по</span><span>ле</span>, and , '
              u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
    expected = u'по ле, and , more ! now a (boo)'
    assert extract_text(markup, guess_punct_space=True) == expected