Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix unwanted joins for inline tags #2

Merged
merged 6 commits into from
May 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to the users.
It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
adding spaces around inline elements too
(which are often used as block elements in html markup),
and tries to avoid adding extra spaces for punctuation.


Install
Expand Down
41 changes: 37 additions & 4 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
import re

import lxml
import lxml.etree
from lxml.html.clean import Cleaner
Expand Down Expand Up @@ -39,10 +41,33 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)


def selector_to_text(sel):
# Collapses any run of whitespace into a single space when normalizing the result.
_whitespace = re.compile(r'\s+')
# True when a fragment already ends in whitespace, so no separator is needed.
_has_trailing_whitespace = re.compile(r'\s$').search
# True when the next fragment begins with closing punctuation -- a space
# before it would look wrong (e.g. "word ,").
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
# True when the previous fragment ends with an opening parenthesis -- a space
# after it would look wrong (e.g. "( word").
_has_punct_before = re.compile(r'\($').search


def selector_to_text(sel, guess_punct_space=True):
""" Convert a cleaned selector to text.
See html_text.extract_text docstring for description of the approach and options.
"""
return sel.xpath('normalize-space()').extract_first('')
if guess_punct_space:

def fragments():
prev = None
for text in sel.xpath('//text()').extract():

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd recommend using './/text()' so that it can be used for any selector, and not only those coming from extract_text(html)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a great idea, thanks @redapple - I'd like to also make it possible to pass selectors via the public interface.

if prev is not None and (_has_trailing_whitespace(prev)
or (not _has_punct_after(text) and
not _has_punct_before(prev))):
yield ' '
yield text
prev = text

return _whitespace.sub(' ', ''.join(fragments()).strip())

else:
fragments = (x.strip() for x in sel.xpath('//text()').extract())
return _whitespace.sub(' ', ' '.join(x for x in fragments if x))


def cleaned_selector(html):
Expand All @@ -60,10 +85,18 @@ def cleaned_selector(html):
return sel


def extract_text(html, encoding='utf8'):
def extract_text(html, guess_punct_space=True):
"""
Convert html to text.
Almost the same as normalize-space xpath, but this also
adds spaces between inline elements (like <span>) which are
often used as block elements in html markup.

When guess_punct_space is True (default), no extra whitespace is added
for punctuation. This has a slight (around 10%) performance overhead
and is just a heuristic.

html should be a unicode string or an already parsed lxml.html element.
"""
return selector_to_text(cleaned_selector(html))
sel = cleaned_selector(html)
return selector_to_text(sel, guess_punct_space=guess_punct_space)
40 changes: 32 additions & 8 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,49 @@
# -*- coding: utf-8 -*-
import pytest

from html_text import extract_text, parse_html


def test_extract_text():
@pytest.fixture(params=[{'guess_punct_space': flag}
                        for flag in (True, False)])
def all_options(request):
    """Run a test once per ``guess_punct_space`` setting.

    Yields a kwargs dict suitable for ``extract_text(html, **all_options)``.
    """
    return request.param


def test_extract_text(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_declared_encoding():
def test_declared_encoding(all_options):
html = (u'<?xml version="1.0" encoding="utf-8" ?>'
u'<html><style>.div {}</style>'
u'<body>Hello, world!</p></body></html>')
assert extract_text(html) == u'Hello, world!'
assert extract_text(html, **all_options) == u'Hello, world!'


def test_empty():
assert extract_text(u'') == ''
def test_empty(all_options):
assert extract_text(u'', **all_options) == ''


def test_extract_text_from_tree():
def test_extract_text_from_tree(all_options):
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
tree = parse_html(html)
assert extract_text(tree) == u'Hello, world!'
assert extract_text(tree, **all_options) == u'Hello, world!'


def test_inline_tags_whitespace(all_options):
    """Adjacent inline tags are separated by one space; empty tags add none."""
    markup = u'<span>field</span><span>value of</span><span></span>'
    result = extract_text(markup, **all_options)
    assert result == u'field value of'


def test_punct_whitespace():
    """Without the punctuation heuristic, a space is still inserted before ','."""
    markup = u'<div><span>field</span>, and more</div>'
    result = extract_text(markup, guess_punct_space=False)
    assert result == u'field , and more'


def test_punct_whitespace_preserved():
    """The punctuation heuristic must keep spaces that exist in the source text."""
    markup = (u'<div><span>по</span><span>ле</span>, and , '
              u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
    expected = u'по ле, and , more ! now a (boo)'
    assert extract_text(markup, guess_punct_space=True) == expected