diff --git a/README.rst b/README.rst index 6f09e0e..77fdece 100644 --- a/README.rst +++ b/README.rst @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML or ``.get_text()`` from Beautiful Soup? Text extracted with ``html_text`` does not contain inline styles, javascript, comments and other text that is not normally visible to the users. +It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``, +adding spaces around inline elements too +(which are often used as block elements in html markup), +and tries to avoid adding extra spaces for punctuation. Apart from just getting text from the page (e.g. for display or search), one intended usage of this library is for machine learning (feature extraction). diff --git a/html_text/html_text.py b/html_text/html_text.py index 532c3ac..56d220e 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -1,4 +1,6 @@ # -*- coding: utf-8 -*- +import re + import lxml import lxml.etree from lxml.html.clean import Cleaner @@ -39,10 +41,33 @@ def parse_html(html): return lxml.html.fromstring(html.encode('utf8'), parser=parser) -def selector_to_text(sel): +_whitespace = re.compile(r'\s+') +_has_trailing_whitespace = re.compile(r'\s$').search +_has_punct_after = re.compile(r'^[,:;.!?"\)]').search +_has_punct_before = re.compile(r'\($').search + + +def selector_to_text(sel, guess_punct_space=True): """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. 
""" - return sel.xpath('normalize-space()').extract_first('') + if guess_punct_space: + + def fragments(): + prev = None + for text in sel.xpath('//text()').extract(): + if prev is not None and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) and + not _has_punct_before(prev))): + yield ' ' + yield text + prev = text + + return _whitespace.sub(' ', ''.join(fragments()).strip()) + + else: + fragments = (x.strip() for x in sel.xpath('//text()').extract()) + return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) def cleaned_selector(html): @@ -60,10 +85,18 @@ def cleaned_selector(html): return sel -def extract_text(html, encoding='utf8'): +def extract_text(html, guess_punct_space=True): """ Convert html to text. + Almost the same as normalize-space xpath, but this also + adds spaces between inline elements (like ) which are + often used as block elements in html markup. + + When guess_punct_space is True (default), no extra whitespace is added + for punctuation. This has a slight (around 10%) performance overhead + and is just a heuristic. html should be a unicode string or an already parsed lxml.html element. """ - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index eedba9a..1205da7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,25 +1,49 @@ # -*- coding: utf-8 -*- +import pytest from html_text import extract_text, parse_html -def test_extract_text(): +@pytest.fixture(params=[{'guess_punct_space': True}, + {'guess_punct_space': False}]) +def all_options(request): + return request.param + + +def test_extract_text(all_options): html = u'

Hello, world!' - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_declared_encoding(): +def test_declared_encoding(all_options): html = (u'' u'' u'Hello, world!

') - assert extract_text(html) == u'Hello, world!' + assert extract_text(html, **all_options) == u'Hello, world!' -def test_empty(): - assert extract_text(u'') == '' +def test_empty(all_options): + assert extract_text(u'', **all_options) == '' -def test_extract_text_from_tree(): +def test_extract_text_from_tree(all_options): html = u'

Hello, world!' tree = parse_html(html) - assert extract_text(tree) == u'Hello, world!' + assert extract_text(tree, **all_options) == u'Hello, world!' + + +def test_inline_tags_whitespace(all_options): + html = u'fieldvalue of' + assert extract_text(html, **all_options) == u'field value of' + + +def test_punct_whitespace(): + html = u'

field, and more
' + assert extract_text(html, guess_punct_space=False) == u'field , and more' + + +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
a (boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now a (boo)')