From d17ec6cd1097fcfa8322b0665f0b411dcfd77ab0 Mon Sep 17 00:00:00 2001 From: Konstantin Lopuhin Date: Mon, 29 May 2017 12:22:34 +0300 Subject: [PATCH] guess_punct_space: remove whitespace before punct This is similar to webstruct.utils.smart_joins (https://github.com/scrapinghub/webstruct/blob/5a3f39e2ec78a04ca021a12dff58f66686d86251/webstruct/utils.py#L61), but is applied only on the tag boundaries. This mode is just a little bit slower than default. --- html_text/html_text.py | 28 +++++++++++++++++++++------- tests/test_html_text.py | 16 ++++++++++++++-- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 2b61a9a..a149ade 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -41,18 +41,31 @@ def parse_html(html): return lxml.html.fromstring(html.encode('utf8'), parser=parser) -_whitespace = re.compile('\s+') +_whitespace = re.compile(r'\s+') +_trailing_whitespace = re.compile(r'\s$') +_punct_after = re.compile(r'[,:;.!?"\)]') +_punct_before = re.compile(r'[\(]') -def selector_to_text(sel): +def selector_to_text(sel, guess_punct_space=False): """ Convert a cleaned selector to text. Almost the same as xpath normalize-space, but this also adds spaces between inline elements (like <span>) which are often used as block elements in html markup. 
""" - fragments = (_whitespace.sub(' ', x.strip()) - for x in sel.xpath('//text()').extract()) - return ' '.join(x for x in fragments if x) + if guess_punct_space: + fragments = [] + for text in sel.xpath('//text()').extract(): + if fragments and (_trailing_whitespace.search(fragments[-1]) + or (not _punct_after.match(text) and + not _punct_before.match(fragments[-1]))): + fragments.append(' ') + fragments.append(text) + return _whitespace.sub(' ', ''.join(fragments).strip()) + else: + fragments = (_whitespace.sub(' ', x.strip()) + for x in sel.xpath('//text()').extract()) + return ' '.join(x for x in fragments if x) def cleaned_selector(html): @@ -70,10 +83,11 @@ def cleaned_selector(html): return sel -def extract_text(html, encoding='utf8'): +def extract_text(html, guess_punct_space=False): """ Convert html to text. html should be a unicode string or an already parsed lxml.html element. """ - return selector_to_text(cleaned_selector(html)) + sel = cleaned_selector(html) + return selector_to_text(sel, guess_punct_space=guess_punct_space) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index b5078e3..bdbf465 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -26,5 +26,17 @@ def test_extract_text_from_tree(): def test_inline_tags_whitespace(): - html = u'fieldvalue' - assert extract_text(html) == u'field value' + html = u'fieldvalue of' + assert extract_text(html) == u'field value of' + + +def test_punct_whitespace(): + html = u'
field, and more
' + assert extract_text(html) == u'field , and more' + + +def test_punct_whitespace_preserved(): + html = (u'
поле, and , ' + u'more !now
(boo)') + assert (extract_text(html, guess_punct_space=True) == + u'по ле, and , more ! now (boo)')