Skip to content

Commit

Permalink
Merge pull request #2 from TeamHG-Memex/inline-tags-spaces
Browse files Browse the repository at this point in the history
Fix unwanted joins for inline tags
  • Loading branch information
lopuhin committed May 29, 2017
2 parents c7ebb57 + 1fb2ec4 commit cf48523
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 12 deletions.
4 changes: 4 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ How is html_text different from ``.xpath('//text()')`` from LXML
or ``.get_text()`` from Beautiful Soup?
Text extracted with ``html_text`` does not contain inline styles,
javascript, comments and other text that is not normally visible to the users.
It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
adding spaces around inline elements too
(which are often used as block elements in html markup),
and tries to avoid adding extra spaces for punctuation.

Apart from just getting text from the page (e.g. for display or search),
one intended usage of this library is for machine learning (feature extraction).
Expand Down
41 changes: 37 additions & 4 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
import re

import lxml
import lxml.etree
from lxml.html.clean import Cleaner
Expand Down Expand Up @@ -39,10 +41,33 @@ def parse_html(html):
return lxml.html.fromstring(html.encode('utf8'), parser=parser)


# Any run of whitespace; collapsed to a single space in the final text.
_whitespace = re.compile(r'\s+')
# Matches when a text fragment ends with a whitespace character.
_has_trailing_whitespace = re.compile(r'\s$').search
# Matches when a text fragment starts with punctuation that should stay
# attached to the preceding fragment (comma, colon, closing paren, ...).
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
# Matches when a text fragment ends with an opening parenthesis, which
# should stay attached to the following fragment.
_has_punct_before = re.compile(r'\($').search


def selector_to_text(sel, guess_punct_space=True):
    """ Convert a cleaned selector to text.

    Text nodes found below ``sel`` are joined and runs of whitespace are
    collapsed to single spaces. When ``guess_punct_space`` is true, no space
    is inserted before a fragment starting with punctuation or after
    a fragment ending with an opening parenthesis, unless the previous
    fragment already ends with whitespace. When it is false, every non-empty
    fragment is separated by a single space.

    See html_text.extract_text docstring for description of the approach and options.
    """
    # Use a relative path: an absolute '//text()' is evaluated from the
    # document root and would also return text located outside of ``sel``.
    texts = sel.xpath('.//text()').extract()

    if not guess_punct_space:
        stripped = (text.strip() for text in texts)
        return _whitespace.sub(' ', ' '.join(s for s in stripped if s))

    def fragments():
        prev = None
        for text in texts:
            if prev is not None and (_has_trailing_whitespace(prev)
                                     or (not _has_punct_after(text) and
                                         not _has_punct_before(prev))):
                yield ' '
            yield text
            prev = text

    # Extra spaces produced above are harmless: they are collapsed here.
    return _whitespace.sub(' ', ''.join(fragments()).strip())


def cleaned_selector(html):
Expand All @@ -60,10 +85,18 @@ def cleaned_selector(html):
return sel


def extract_text(html, guess_punct_space=True):
    """
    Convert html to text.

    Almost the same as normalize-space xpath, but this also
    adds spaces between inline elements (like <span>) which are
    often used as block elements in html markup.

    When guess_punct_space is True (default), no extra whitespace is added
    for punctuation. This has a slight (around 10%) performance overhead
    and is just a heuristic.

    html should be a unicode string or an already parsed lxml.html element.
    """
    sel = cleaned_selector(html)
    # Forward the option explicitly so the caller's choice is honoured.
    return selector_to_text(sel, guess_punct_space=guess_punct_space)
40 changes: 32 additions & 8 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,49 @@
# -*- coding: utf-8 -*-
import pytest

from html_text import extract_text, parse_html


@pytest.fixture(params=[{'guess_punct_space': True},
                        {'guess_punct_space': False}])
def all_options(request):
    """Keyword arguments for ``extract_text``; one test run per option set."""
    return request.param


def test_extract_text(all_options):
    """Only the paragraph text is expected; the <style> content is not."""
    html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
    assert extract_text(html, **all_options) == u'Hello, world!'


def test_declared_encoding(all_options):
    """A unicode input that starts with an XML encoding declaration works."""
    html = (u'<?xml version="1.0" encoding="utf-8" ?>'
            u'<html><style>.div {}</style>'
            u'<body>Hello, world!</p></body></html>')
    assert extract_text(html, **all_options) == u'Hello, world!'


def test_empty(all_options):
    """An empty input string yields an empty result."""
    assert extract_text(u'', **all_options) == ''


def test_extract_text_from_tree(all_options):
    """``extract_text`` accepts the tree returned by ``parse_html``."""
    html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>'
    tree = parse_html(html)
    assert extract_text(tree, **all_options) == u'Hello, world!'


def test_inline_tags_whitespace(all_options):
    """Adjacent inline elements are expected to be separated by one space."""
    markup = u'<span>field</span><span>value of</span><span></span>'
    extracted = extract_text(markup, **all_options)
    assert extracted == u'field value of'


def test_punct_whitespace():
    """With guessing disabled, a space is expected before the comma too."""
    markup = u'<div><span>field</span>, and more</div>'
    expected = u'field , and more'
    assert extract_text(markup, guess_punct_space=False) == expected


def test_punct_whitespace_preserved():
    """With guessing enabled, whitespace present in the markup is kept
    while no extra space is added next to punctuation."""
    markup = (u'<div><span>по</span><span>ле</span>, and , '
              u'<span>more </span>!<span>now</div>a (<b>boo</b>)')
    expected = u'по ле, and , more ! now a (boo)'
    assert extract_text(markup, guess_punct_space=True) == expected

0 comments on commit cf48523

Please sign in to comment.