Merge pull request #2 from TeamHG-Memex/inline-tags-spaces

Fix unwanted joins for inline tags
TeamHG-Memex · May 29, 2017 · cf48523 · cf48523
2 parents c7ebb57 + 1fb2ec4
commit cf48523
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 12 deletions.
diff --git a/README.rst b/README.rst
@@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
 or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
+It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
+adding spaces around inline elements too
+(which are often used as block elements in html markup),
+and tries to avoid adding extra spaces for punctuation.
 
 Apart from just getting text from the page (e.g. for display or search),
 one intended usage of this library is for machine learning (feature extraction).

diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+import re
+
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
@@ -39,10 +41,33 @@ def parse_html(html):
     return lxml.html.fromstring(html.encode('utf8'), parser=parser)
 
 
-def selector_to_text(sel):
+_whitespace = re.compile(r'\s+')
+_has_trailing_whitespace = re.compile(r'\s$').search
+_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
+_has_punct_before = re.compile(r'\($').search
+
+
+def selector_to_text(sel, guess_punct_space=True):
     """ Convert a cleaned selector to text.
+    See html_text.extract_text docstring for description of the approach and options.
     """
-    return sel.xpath('normalize-space()').extract_first('')
+    if guess_punct_space:
+
+        def fragments():
+            prev = None
+            for text in sel.xpath('//text()').extract():
+                if prev is not None and (_has_trailing_whitespace(prev)
+                                         or (not _has_punct_after(text) and
+                                             not _has_punct_before(prev))):
+                    yield ' '
+                yield text
+                prev = text
+
+        return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+    else:
+        fragments = (x.strip() for x in sel.xpath('//text()').extract())
+        return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
 
 
 def cleaned_selector(html):
@@ -60,10 +85,18 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, encoding='utf8'):
+def extract_text(html, guess_punct_space=True):
     """
     Convert html to text.
+    Almost the same as normalize-space xpath, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
+
+    When guess_punct_space is True (default), no extra whitespace is added
+    for punctuation. This has a slight (around 10%) performance overhead
+    and is just a heuristic.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
-    return selector_to_text(cleaned_selector(html))
+    sel = cleaned_selector(html)
+    return selector_to_text(sel, guess_punct_space=guess_punct_space)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -1,25 +1,49 @@
 # -*- coding: utf-8 -*-
+import pytest
 
 from html_text import extract_text, parse_html
 
 
-def test_extract_text():
+@pytest.fixture(params=[{'guess_punct_space': True},
+                        {'guess_punct_space': False}])
+def all_options(request):
+    return request.param
+
+
+def test_extract_text(all_options):
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
-    assert extract_text(html) == u'Hello, world!'
+    assert extract_text(html, **all_options) == u'Hello, world!'
 
 
-def test_declared_encoding():
+def test_declared_encoding(all_options):
     html = (u'<?xml version="1.0" encoding="utf-8" ?>'
             u'<html><style>.div {}</style>'
             u'<body>Hello,   world!</p></body></html>')
-    assert extract_text(html) == u'Hello, world!'
+    assert extract_text(html, **all_options) == u'Hello, world!'
 
 
-def test_empty():
-    assert extract_text(u'') == ''
+def test_empty(all_options):
+    assert extract_text(u'', **all_options) == ''
 
 
-def test_extract_text_from_tree():
+def test_extract_text_from_tree(all_options):
     html = u'<html><style>.div {}</style><body><p>Hello,   world!</body></html>'
     tree = parse_html(html)
-    assert extract_text(tree) == u'Hello, world!'
+    assert extract_text(tree, **all_options) == u'Hello, world!'
+
+
+def test_inline_tags_whitespace(all_options):
+    html = u'<span>field</span><span>value  of</span><span></span>'
+    assert extract_text(html, **all_options) == u'field value of'
+
+
+def test_punct_whitespace():
+    html = u'<div><span>field</span>, and more</div>'
+    assert extract_text(html, guess_punct_space=False) == u'field , and more'
+
+
+def test_punct_whitespace_preserved():
+    html = (u'<div><span>по</span><span>ле</span>, and  ,  '
+            u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
+    assert (extract_text(html, guess_punct_space=True) ==
+            u'по ле, and , more ! now a (boo)')