diff --git a/README.rst b/README.rst index 4c24d79..19e14d1 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? Text extracted with ``html_text`` does not contain inline styles, javascript, comments and other text that is not normally visible to the users. +It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``, +adding spaces around inline elements too +(which are often used as block elements in html markup), +and tries to avoid adding extra spaces for punctuation. Install diff --git a/html_text/html_text.py b/html_text/html_text.py index 465ebad..56d220e 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -47,11 +47,9 @@ def parse_html(html): _has_punct_before = re.compile(r'\($').search -def selector_to_text(sel, guess_punct_space=False): +def selector_to_text(sel, guess_punct_space=True): """ Convert a cleaned selector to text. - Almost the same as xpath normalize-space, but this also - adds spaces between inline elements (like <span>) which are - often used as block elements in html markup. + See html_text.extract_text docstring for description of the approach and options. """ if guess_punct_space: @@ -87,9 +85,16 @@ def cleaned_selector(html): return sel -def extract_text(html, guess_punct_space=False): +def extract_text(html, guess_punct_space=True): """ Convert html to text. + Almost the same as normalize-space xpath, but this also + adds spaces between inline elements (like <span>) which are + often used as block elements in html markup. + + When guess_punct_space is True (default), no extra whitespace is added + for punctuation. This has a slight (around 10%) performance overhead + and is just a heuristic. html should be a unicode string or an already parsed lxml.html element. 
""" diff --git a/tests/test_html_text.py b/tests/test_html_text.py index f2daac8..1205da7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -39,7 +39,7 @@ def test_inline_tags_whitespace(all_options): def test_punct_whitespace(): html = u'
field, and more
' - assert extract_text(html) == u'field , and more' + assert extract_text(html, guess_punct_space=False) == u'field , and more' def test_punct_whitespace_preserved():