Skip to content

Commit

Permalink
Make guess_punct_space=True by default, document
Browse files Browse the repository at this point in the history
  • Loading branch information
lopuhin committed May 29, 2017
1 parent e9cf9b8 commit 1fb2ec4
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 6 deletions.
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to the users.
It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
adding spaces around inline elements too
(which are often used as block elements in html markup),
and tries to avoid adding extra spaces for punctuation.


Install
Expand Down
15 changes: 10 additions & 5 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,9 @@ def parse_html(html):
_has_punct_before = re.compile(r'\($').search


def selector_to_text(sel, guess_punct_space=False):
def selector_to_text(sel, guess_punct_space=True):
""" Convert a cleaned selector to text.
Almost the same as xpath normalize-space, but this also
adds spaces between inline elements (like <span>) which are
often used as block elements in html markup.
See html_text.extract_text docstring for description of the approach and options.
"""
if guess_punct_space:

Expand Down Expand Up @@ -87,9 +85,16 @@ def cleaned_selector(html):
return sel


def extract_text(html, guess_punct_space=False):
def extract_text(html, guess_punct_space=True):
"""
Convert html to text.
Almost the same as normalize-space xpath, but this also
adds spaces between inline elements (like <span>) which are
often used as block elements in html markup.
When guess_punct_space is True (default), no extra whitespace is added
for punctuation. This has a slight (around 10%) performance overhead
and is just a heuristic.
html should be a unicode string or an already parsed lxml.html element.
"""
Expand Down
2 changes: 1 addition & 1 deletion tests/test_html_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_inline_tags_whitespace(all_options):

def test_punct_whitespace():
html = u'<div><span>field</span>, and more</div>'
assert extract_text(html) == u'field , and more'
assert extract_text(html, guess_punct_space=False) == u'field , and more'


def test_punct_whitespace_preserved():
Expand Down

0 comments on commit 1fb2ec4

Please sign in to comment.