diff --git a/html_text/html_text.py b/html_text/html_text.py
index 2b61a9a..544dc49 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -41,18 +41,36 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)
-_whitespace = re.compile('\s+')
+_whitespace = re.compile(r'\s+')  # any whitespace run; collapsed to a single space
+_trailing_whitespace = re.compile(r'\s$')  # used with .search(): fragment ends in whitespace
+_punct_after = re.compile(r'[,:;.!?"\)]')  # used with .match(): fragment STARTS with closing punctuation
+_punct_before = re.compile(r'\($')  # used with .search(): fragment ENDS with an opening parenthesis
-def selector_to_text(sel):
+def selector_to_text(sel, guess_punct_space=False):
     """ Convert a cleaned selector to text.
     Almost the same as xpath normalize-space, but this also
     adds spaces between inline elements (like <span>) which are
     often used as block elements in html markup.
     """
-    fragments = (_whitespace.sub(' ', x.strip())
-                 for x in sel.xpath('//text()').extract())
-    return ' '.join(x for x in fragments if x)
+    if guess_punct_space:  # opt-in: no space is forced next to punctuation
+
+        def fragments():
+            prev = None  # previous text node; None before the first one
+            for text in sel.xpath('//text()').extract():
+                if prev is not None and (_trailing_whitespace.search(prev)
+                        or (not _punct_after.match(text) and
+                            not _punct_before.search(prev))):  # look at the END of prev for "("
+                    yield ' '
+                yield text
+                prev = text
+
+        return _whitespace.sub(' ', ''.join(fragments()).strip())
+
+    else:
+        fragments = (_whitespace.sub(' ', x.strip())
+                     for x in sel.xpath('//text()').extract())
+        return ' '.join(x for x in fragments if x)
def cleaned_selector(html):
@@ -70,10 +88,11 @@ def cleaned_selector(html):
return sel
-def extract_text(html, encoding='utf8'):
+def extract_text(html, encoding='utf8', guess_punct_space=False):  # encoding is unused; kept so existing callers do not break
     """
     Convert html to text.
     html should be a unicode string or an already parsed lxml.html element.
     """
-    return selector_to_text(cleaned_selector(html))
+    sel = cleaned_selector(html)
+    return selector_to_text(sel, guess_punct_space=guess_punct_space)  # forward the punctuation-spacing option unchanged
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index b5078e3..a441d00 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -1,30 +1,49 @@
# -*- coding: utf-8 -*-
+import pytest
from html_text import extract_text, parse_html
-def test_extract_text():
+@pytest.fixture(params=[{'guess_punct_space': True},
+ {'guess_punct_space': False}])
+def all_options(request):  # parametrized fixture: a test using it runs once per option dict in params
+ return request.param  # the keyword-option dict for the current run
+
+
+def test_extract_text(all_options):  # exercised with guess_punct_space both True and False
html = u' Hello, world!'
- assert extract_text(html) == u'Hello, world!'
+ assert extract_text(html, **all_options) == u'Hello, world!'
-def test_declared_encoding():
+def test_declared_encoding(all_options):
html = (u''
u''
u'
Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'