Make guess_punct_space=True by default, document

TeamHG-Memex · May 29, 2017 · 1fb2ec4 · 1fb2ec4
1 parent e9cf9b8
commit 1fb2ec4
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 6 deletions.
diff --git a/README.rst b/README.rst
@@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
 or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
+It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``:
+it adds spaces around inline elements too
+(which are often used as block elements in html markup),
+and tries to avoid adding extra spaces for punctuation.
 
 
 Install

diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -47,11 +47,9 @@ def parse_html(html):
 _has_punct_before = re.compile(r'\($').search
 
 
-def selector_to_text(sel, guess_punct_space=False):
+def selector_to_text(sel, guess_punct_space=True):
     """ Convert a cleaned selector to text.
-    Almost the same as xpath normalize-space, but this also
-    adds spaces between inline elements (like <span>) which are
-    often used as block elements in html markup.
+    See html_text.extract_text docstring for description of the approach and options.
     """
     if guess_punct_space:
 
@@ -87,9 +85,16 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, guess_punct_space=False):
+def extract_text(html, guess_punct_space=True):
     """
     Convert html to text.
+    Almost the same as normalize-space xpath, but this also
+    adds spaces between inline elements (like <span>) which are
+    often used as block elements in html markup.
+
+    When guess_punct_space is True (default), no extra whitespace is added
+    for punctuation. This has a slight (around 10%) performance overhead
+    and is just a heuristic.
 
     html should be a unicode string or an already parsed lxml.html element.
     """

diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -39,7 +39,7 @@ def test_inline_tags_whitespace(all_options):
 
 def test_punct_whitespace():
     html = u'<div><span>field</span>, and more</div>'
-    assert extract_text(html) == u'field , and more'
+    assert extract_text(html, guess_punct_space=False) == u'field , and more'
 
 
 def test_punct_whitespace_preserved():