From 0ae6d24cf5ef038f9cf5913cb0cce0d5bb714cbd Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 24 Aug 2018 14:48:54 -0700 Subject: [PATCH 01/40] add first working approach plus debug code --- html_text/html_text.py | 80 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 1b0462c..98920d0 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,7 +6,6 @@ from lxml.html.clean import Cleaner import parsel - _clean_html = Cleaner( scripts=True, javascript=False, # onclick attributes are fine @@ -69,6 +68,59 @@ def fragments(): fragments = (x.strip() for x in sel.xpath('.//text()').extract()) return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) +def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. + """ + + if guess_punct_space: + def add_newline(tag): + if tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + return '\n' + return '' + + def traverse_text_fragments(tree, prev): + space = '' + newline = '' + if tree.text: + text = _whitespace.sub(' ', tree.text.strip()) + if text: + if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) + and (not _has_punct_after(tree.text) and + not _has_punct_before(prev[0]))): + space = ' ' + if guess_page_layout: + newline = add_newline(tree.tag) + yield [space, text, newline] + prev[0] = (newline or text) + space = '' + newline = '' + + for child in tree: # where is my precious "yield from"? + for t in traverse_text_fragments(child, prev): + yield t + + if tree.tail: + text = _whitespace.sub(' ', tree.tail.strip()) + if text: + if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) + and (not _has_punct_after(tree.tail) and + not _has_punct_before(prev[0]))): + space = ' ' + if guess_page_layout: + newline = add_newline(tree.tag) + yield [space, text, newline] + prev[0] = (newline or text) + + text = [] + for fragment in traverse_text_fragments(tree, [None]): + text.extend(fragment) + return ''.join(text).strip() + + else: + # fragments = (x.strip() for x in sel.xpath('.//text()').extract()) + # return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) + pass def cleaned_selector(html): """ Clean selector. @@ -85,7 +137,7 @@ def cleaned_selector(html): return sel -def extract_text(html, guess_punct_space=True): +def extract_text(html, guess_punct_space=True, guess_page_layout=False): """ Convert html to text, cleaning invisible content such as styles. Almost the same as normalize-space xpath, but this also @@ -98,5 +150,27 @@ def extract_text(html, guess_punct_space=True): html should be a unicode string or an already parsed lxml.html element. 
""" + # from time import time + + + cleaned = _clean_html(html) + # t1 = time() + res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) + # t2 = time() + # print('NEW') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # else: + # # t0 = time() sel = cleaned_selector(html) - return selector_to_text(sel, guess_punct_space=guess_punct_space) + # t1 = time() + old = selector_to_text(sel, guess_punct_space=guess_punct_space) + # t2 = time() + # print('OLD') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # print('') + # t0 = time() + return res, old \ No newline at end of file From 566dc9b71c14f92f0ae38b84282ec76116343199 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 24 Aug 2018 16:16:21 -0700 Subject: [PATCH 02/40] add newline only at the end of selected tags --- html_text/html_text.py | 70 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 98920d0..92f2bfd 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -74,43 +74,42 @@ def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): """ if guess_punct_space: - def add_newline(tag): - if tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: - return '\n' - return '' def traverse_text_fragments(tree, prev): space = '' - newline = '' if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text: if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) and (not _has_punct_after(tree.text) and - not _has_punct_before(prev[0]))): + not _has_punct_before(prev[0]))): space = ' ' - if guess_page_layout: - newline = add_newline(tree.tag) - yield [space, text, newline] - prev[0] = (newline or text) + + yield [space, text] + prev[0] = text space = '' - newline = '' for child in tree: # where is my precious "yield from"? for t in traverse_text_fragments(child, prev): yield t + + tail_text = [] + if guess_page_layout and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + tail_text.append('\n') + prev[0] = '\n' if tree.tail: text = _whitespace.sub(' ', tree.tail.strip()) if text: - if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) - and (not _has_punct_after(tree.tail) and - not _has_punct_before(prev[0]))): - space = ' ' - if guess_page_layout: - newline = add_newline(tree.tag) - yield [space, text, newline] - prev[0] = (newline or text) + if (not tail_text and prev[0] is not None and + not _has_trailing_whitespace(prev[0]) and + not _has_punct_after(tree.tail) and + not _has_punct_before(prev[0])): + tail_text.append(' ') + tail_text.append(text) + prev[0] = text + if tail_text: + yield tail_text text = [] for fragment in traverse_text_fragments(tree, [None]): @@ -137,7 +136,7 @@ def cleaned_selector(html): return sel -def extract_text(html, guess_punct_space=True, guess_page_layout=False): +def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. 
Almost the same as normalize-space xpath, but this also @@ -152,20 +151,21 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False): """ # from time import time - - cleaned = _clean_html(html) - # t1 = time() - res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) - # t2 = time() - # print('NEW') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # else: - # # t0 = time() - sel = cleaned_selector(html) - # t1 = time() - old = selector_to_text(sel, guess_punct_space=guess_punct_space) + if new: + cleaned = _clean_html(html) + # t1 = time() + res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) + # t2 = time() + # print('NEW') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # else: + # # t0 = time() + else: + sel = cleaned_selector(html) + # t1 = time() + res = selector_to_text(sel, guess_punct_space=guess_punct_space) # t2 = time() # print('OLD') # print('clean_time: ', t1 - t0) @@ -173,4 +173,4 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False): # print('total_time: ', t2 - t0) # print('') # t0 = time() - return res, old \ No newline at end of file + return res \ No newline at end of file From 587e9a713a1eb4448c6d11954bccda6f3281ff02 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 27 Aug 2018 12:04:03 -0700 Subject: [PATCH 03/40] fix multiple consecutive newlines --- html_text/html_text.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 92f2bfd..6bc1c23 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -94,7 +94,9 @@ def traverse_text_fragments(tree, prev): yield t tail_text = [] - if guess_page_layout and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + if (guess_page_layout + and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] + and prev[0] != '\n'): tail_text.append('\n') prev[0] = '\n' From 6c9d27e3623416a95950100d310600d0e88bbcb3 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 27 Aug 2018 13:34:09 -0700 Subject: [PATCH 04/40] add guess_space = False option --- html_text/__init__.py | 2 +- html_text/html_text.py | 160 ++++++++++++++--------------------------- 2 files changed, 55 insertions(+), 107 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index db40e63..9c9d86a 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text +from .html_text import extract_text, parse_html, html_to_text diff --git a/html_text/html_text.py b/html_text/html_text.py index 6bc1c23..62ecb2f 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -4,7 +4,6 @@ import lxml import lxml.etree from lxml.html.clean import Cleaner -import parsel _clean_html = Cleaner( scripts=True, @@ -46,96 +45,66 @@ def parse_html(html): _has_punct_before = re.compile(r'\($').search -def selector_to_text(sel, guess_punct_space=True): - """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. +def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned html tree to text. + See html_text.extract_text docstring for description of the approach + and options. 
""" - if guess_punct_space: - def fragments(): - prev = None - for text in sel.xpath('.//text()').extract(): - if prev is not None and (_has_trailing_whitespace(prev) - or (not _has_punct_after(text) and - not _has_punct_before(prev))): - yield ' ' - yield text - prev = text - - return _whitespace.sub(' ', ''.join(fragments()).strip()) - - else: - fragments = (x.strip() for x in sel.xpath('.//text()').extract()) - return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) - -def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): - """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. - """ - - if guess_punct_space: - - def traverse_text_fragments(tree, prev): - space = '' - if tree.text: + def should_add_space(text, prev): + return (prev is not None + and (not _has_trailing_whitespace(prev) + and (not _has_punct_after(text) + and not _has_punct_before(prev) + ) + ) + ) + + def traverse_text_fragments(tree, prev): + space = '' + if tree.text: + if guess_punct_space: text = _whitespace.sub(' ', tree.text.strip()) - if text: - if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) - and (not _has_punct_after(tree.text) and - not _has_punct_before(prev[0]))): - space = ' ' - - yield [space, text] - prev[0] = text - space = '' - - for child in tree: # where is my precious "yield from"? - for t in traverse_text_fragments(child, prev): - yield t - - tail_text = [] - if (guess_page_layout - and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] - and prev[0] != '\n'): - tail_text.append('\n') - prev[0] = '\n' - - if tree.tail: + if text and should_add_space(text, prev[0]): + space = ' ' + yield [space, text] + prev[0] = text + space = '' + else: + yield[tree.text] + + for child in tree: + for t in traverse_text_fragments(child, prev): + yield t + + tail_text = [] + if (guess_page_layout + and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] + and prev[0] != '\n' + ): + tail_text.append('\n') + prev[0] = '\n' + + if tree.tail: + if guess_punct_space: text = _whitespace.sub(' ', tree.tail.strip()) if text: - if (not tail_text and prev[0] is not None and - not _has_trailing_whitespace(prev[0]) and - not _has_punct_after(tree.tail) and - not _has_punct_before(prev[0])): + if (not tail_text # do not add space after newline + and should_add_space(text, prev[0])): tail_text.append(' ') + tail_text.append(text) prev[0] = text - if tail_text: - yield tail_text - - text = [] - for fragment in traverse_text_fragments(tree, [None]): - text.extend(fragment) - return ''.join(text).strip() + else: + tail_text.append(tree.tail) + if tail_text: + yield tail_text - else: - # fragments = (x.strip() for x in sel.xpath('.//text()').extract()) - # return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) - pass + text = [] + for fragment in traverse_text_fragments(tree, [None]): + text.extend(fragment) + return ''.join(text).strip() -def cleaned_selector(html): - """ Clean selector. 
- """ - try: - tree = _cleaned_html_tree(html) - sel = parsel.Selector(root=tree, type='html') - except (lxml.etree.XMLSyntaxError, - lxml.etree.ParseError, - lxml.etree.ParserError, - UnicodeEncodeError): - # likely plain text - sel = parsel.Selector(html) - return sel def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): @@ -151,28 +120,7 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True html should be a unicode string or an already parsed lxml.html element. """ - # from time import time - - if new: - cleaned = _clean_html(html) - # t1 = time() - res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) - # t2 = time() - # print('NEW') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # else: - # # t0 = time() - else: - sel = cleaned_selector(html) - # t1 = time() - res = selector_to_text(sel, guess_punct_space=guess_punct_space) - # t2 = time() - # print('OLD') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # print('') - # t0 = time() - return res \ No newline at end of file + if not html: + return '' + cleaned = _cleaned_html_tree(html) + return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) From c22f3fa6ede49560c39e6ad702b310fe32b5ca62 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 28 Aug 2018 12:39:18 -0700 Subject: [PATCH 05/40] move add space and newline checks to a function --- html_text/html_text.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 62ecb2f..007a739 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -5,6 +5,10 @@ import lxml.etree from lxml.html.clean import Cleaner + +NEWLINE_TAGS = ['title', 'p', 'li', 'dd', 'dt', 'dl', 'ul', + 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + _clean_html = Cleaner( scripts=True, javascript=False, # onclick attributes are fine @@ -51,7 +55,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): and options. 
""" - def should_add_space(text, prev): + def add_space(text, prev): return (prev is not None and (not _has_trailing_whitespace(prev) and (not _has_punct_after(text) @@ -60,28 +64,29 @@ def should_add_space(text, prev): ) ) + def add_newline(tag, prev): + return tag in NEWLINE_TAGS and prev != '\n' + def traverse_text_fragments(tree, prev): space = '' if tree.text: if guess_punct_space: text = _whitespace.sub(' ', tree.text.strip()) - if text and should_add_space(text, prev[0]): + if text and add_space(text, prev[0]): space = ' ' yield [space, text] prev[0] = text space = '' else: - yield[tree.text] + yield [tree.text] + prev[0] = tree.text for child in tree: for t in traverse_text_fragments(child, prev): yield t tail_text = [] - if (guess_page_layout - and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] - and prev[0] != '\n' - ): + if guess_page_layout and add_newline(tree.tag, prev[0]): tail_text.append('\n') prev[0] = '\n' @@ -90,13 +95,14 @@ def traverse_text_fragments(tree, prev): text = _whitespace.sub(' ', tree.tail.strip()) if text: if (not tail_text # do not add space after newline - and should_add_space(text, prev[0])): + and add_space(text, prev[0])): tail_text.append(' ') tail_text.append(text) prev[0] = text else: tail_text.append(tree.tail) + prev[0] = tree.tail if tail_text: yield tail_text @@ -120,7 +126,7 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True html should be a unicode string or an already parsed lxml.html element. """ - if not html: + if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) From 8a78fc58c5d6a721489038158564112b24bd3f49 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 28 Aug 2018 12:39:40 -0700 Subject: [PATCH 06/40] add tests guess_page_layout --- tests/test_html_text.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 594ea3c..648c319 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,11 +1,15 @@ # -*- coding: utf-8 -*- import pytest -from html_text import extract_text, parse_html, cleaned_selector, selector_to_text +from html_text import extract_text, html_to_text, parse_html @pytest.fixture(params=[{'guess_punct_space': True}, - {'guess_punct_space': False}]) + {'guess_punct_space': False}, + {'guess_punct_space': True, 'guess_page_layout': True}, + {'guess_punct_space': False, 'guess_page_layout': True} + ]) + def all_options(request): return request.param @@ -49,9 +53,27 @@ def test_punct_whitespace_preserved(): u'по ле, and , more ! now a (boo)') -def test_selector(all_options): - html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>'
-    sel = cleaned_selector(html)
-    assert selector_to_text(sel, **all_options) == 'text more and more text'
-    subsel = sel.xpath('//div[@id="extract-me"]')[0]
-    assert selector_to_text(subsel, **all_options) == 'text more'
+# def test_selector(all_options):
+#     html = '<div><div id="extract-me">text<div>more</div></div>and more text</div>'
+#     sel = cleaned_selector(html)
+#     assert selector_to_text(sel, **all_options) == 'text more and more text'
+#     subsel = sel.xpath('//div[@id="extract-me"]')[0]
+#     assert selector_to_text(subsel, **all_options) == 'text more'
+
+def test_guess_page_layout():
+    html = (u'<title>title</title>text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
+            'text_8</p><div>text_9</div><p>...text_10</p>
' + ) + assert (extract_text(html, guess_punct_space=False) == + ('titletext_1.text_2 text_3text_4text_5' + 'text_6text_7text_8text_9...text_10')) + assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == + ('title\ntext_1.text_2 text_3\ntext_4\ntext_5' + '\ntext_6text_7text_8\ntext_9...text_10')) + assert (extract_text(html, guess_punct_space=True) == + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == + ('title\ntext_1. text_2 text_3\ntext_4\ntext_5' + '\ntext_6 text_7 text_8\ntext_9...text_10')) From a783e3134265d7653b2cd31a2f97d4d4a3e8ae6d Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Wed, 29 Aug 2018 11:32:24 -0700 Subject: [PATCH 07/40] remove old test --- tests/test_html_text.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 648c319..ec0cdc6 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -53,13 +53,6 @@ def test_punct_whitespace_preserved(): u'по ле, and , more ! now a (boo)') -# def test_selector(all_options): -# html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' -# sel = cleaned_selector(html) -# assert selector_to_text(sel, **all_options) == 'text more and more text' -# subsel = sel.xpath('//div[@id="extract-me"]')[0] -# assert selector_to_text(subsel, **all_options) == 'text more' - def test_guess_page_layout(): html = (u'title
text_1.<p>text_2 text_3</p>'
             '<ul><li>text_4</li><li>text_5</li></ul><p>
text_6text_7' From cb8dc1cf3831c11b1408006fcb56c2c74795a716 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:13:06 -0700 Subject: [PATCH 08/40] guess_punct_space = False behavior same as before this PR --- html_text/html_text.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 007a739..28d4592 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -56,6 +56,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): """ def add_space(text, prev): + # return True if a space should be added return (prev is not None and (not _has_trailing_whitespace(prev) and (not _has_punct_after(text) @@ -68,43 +69,41 @@ def add_newline(tag, prev): return tag in NEWLINE_TAGS and prev != '\n' def traverse_text_fragments(tree, prev): - space = '' + space = ' ' if tree.text: - if guess_punct_space: - text = _whitespace.sub(' ', tree.text.strip()) - if text and add_space(text, prev[0]): - space = ' ' + text = _whitespace.sub(' ', tree.text.strip()) + if text: + if guess_punct_space and not add_space(text, prev[0]): + space = '' yield [space, text] prev[0] = text - space = '' - else: - yield [tree.text] - prev[0] = tree.text + space = ' ' for child in tree: for t in traverse_text_fragments(child, prev): yield t - tail_text = [] + newline = '' if guess_page_layout and add_newline(tree.tag, prev[0]): - tail_text.append('\n') + newline = '\n' prev[0] = '\n' + tail = '' if tree.tail: - if guess_punct_space: - text = _whitespace.sub(' ', tree.tail.strip()) - if text: - if (not tail_text # do not add space after newline - and add_space(text, prev[0])): - tail_text.append(' ') - - tail_text.append(text) - prev[0] = text - else: - tail_text.append(tree.tail) - prev[0] = tree.tail - if tail_text: - yield tail_text + tail = _whitespace.sub(' ', tree.tail.strip()) + if tail: + if (guess_punct_space + and (not add_space(tail, prev[0]) or newline)): + space = '' + + if tail: + yield [newline, space, tail] + prev[0] = tail + # space = ' ' + # newline = '' + elif newline: + yield [newline] + # newline = '' text = [] for fragment in traverse_text_fragments(tree, [None]): From fb599bcc08c7c0af5fba6f000b01390046cd41cd Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:13:13 -0700 Subject: [PATCH 09/40] fix tests --- tests/test_html_text.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index ec0cdc6..455ebff 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -54,16 +54,16 @@ def test_punct_whitespace_preserved(): def test_guess_page_layout(): - html = (u'title

text_1.<p>text_2 text_3</p>'
-            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
-            'text_8</p><div>text_9</div><p>...text_10</p>
' + html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
+            'text_8</p><div>text_9</div><p>...text_10</p>
' ) assert (extract_text(html, guess_punct_space=False) == - ('titletext_1.text_2 text_3text_4text_5' - 'text_6text_7text_8text_9...text_10')) + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == - ('title\ntext_1.text_2 text_3\ntext_4\ntext_5' - '\ntext_6text_7text_8\ntext_9...text_10')) + ('title\n text_1. text_2 text_3\n text_4\n text_5' + '\n text_6 text_7 text_8\n text_9 ...text_10')) assert (extract_text(html, guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9...text_10')) From 90e37b76a17142f26011100ca5c67d33017fd5d4 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:36:34 -0700 Subject: [PATCH 10/40] fixed tests --- html_text/html_text.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 28d4592..f52cbe0 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -46,7 +46,7 @@ def parse_html(html): _whitespace = re.compile(r'\s+') _has_trailing_whitespace = re.compile(r'\s$').search _has_punct_after = re.compile(r'^[,:;.!?"\)]').search -_has_punct_before = re.compile(r'\($').search +_has_open_bracket_before = re.compile(r'\($').search def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): @@ -57,10 +57,12 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): def add_space(text, prev): # return True if a space should be added + if prev == '\n': + return False return (prev is not None - and (not _has_trailing_whitespace(prev) - and (not _has_punct_after(text) - and not _has_punct_before(prev) + and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) + and not _has_open_bracket_before(prev) ) ) ) @@ -76,7 +78,7 @@ def traverse_text_fragments(tree, prev): if guess_punct_space and not add_space(text, prev[0]): space = '' yield [space, text] - prev[0] = text + prev[0] = tree.text space = ' ' for child in tree: @@ -98,12 +100,9 @@ def traverse_text_fragments(tree, prev): if tail: yield [newline, space, tail] - prev[0] = tail - # space = ' ' - # newline = '' + prev[0] = tree.tail elif newline: yield [newline] - # newline = '' text = [] for fragment in traverse_text_fragments(tree, [None]): From ae26d29ea693a87498595b0af5609f223e0b7590 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 16:46:34 -0700 Subject: [PATCH 11/40] fix indent and make add_space more readable --- html_text/html_text.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index f52cbe0..3943835 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -50,22 +50,21 @@ def parse_html(html): def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): - """ Convert a cleaned html tree to text. - See html_text.extract_text docstring for description of the approach - and options. + """ + Convert a cleaned html tree to text. + See html_text.extract_text docstring for description of the approach + and options. 
""" def add_space(text, prev): - # return True if a space should be added + if prev is None: + return False if prev == '\n': return False - return (prev is not None - and (_has_trailing_whitespace(prev) - or (not _has_punct_after(text) - and not _has_open_bracket_before(prev) - ) - ) - ) + if not _has_trailing_whitespace(prev): + if _has_punct_after(text) or _has_open_bracket_before(prev): + return False + return True def add_newline(tag, prev): return tag in NEWLINE_TAGS and prev != '\n' From bb33d4b51d77a6bcd9a7fb107f2033a6d83c5133 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 17:25:52 -0700 Subject: [PATCH 12/40] add double newline before and after title, p and h tags --- html_text/html_text.py | 40 ++++++++++++++++++++++++++-------------- tests/test_html_text.py | 12 ++++++------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 3943835..cb09a9a 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,8 +6,8 @@ from lxml.html.clean import Cleaner -NEWLINE_TAGS = ['title', 'p', 'li', 'dd', 'dt', 'dl', 'ul', - 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] +DOUBLE_NEWLINE_TAGS = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] _clean_html = Cleaner( scripts=True, @@ -59,7 +59,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): def add_space(text, prev): if prev is None: return False - if prev == '\n': + if prev == '\n' or prev == '\n\n': return False if not _has_trailing_whitespace(prev): if _has_punct_after(text) or _has_open_bracket_before(prev): @@ -67,36 +67,50 @@ def add_space(text, prev): return True def add_newline(tag, prev): - return tag in NEWLINE_TAGS and prev != '\n' + if prev is None or prev == '\n\n': + return '' + if tag in DOUBLE_NEWLINE_TAGS: + if prev == '\n': + return '\n' + return '\n\n' + if tag in NEWLINE_TAGS: + if prev == '\n': + return '' + return '\n' + return '' def traverse_text_fragments(tree, prev): space = ' ' + newline = '' if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text: + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline if guess_punct_space and not add_space(text, prev[0]): space = '' - yield [space, text] + yield [newline, space, text] prev[0] = tree.text space = ' ' + newline = '' for child in tree: for t in traverse_text_fragments(child, prev): yield t - newline = '' - if guess_page_layout and add_newline(tree.tag, prev[0]): - newline = '\n' - prev[0] = '\n' + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline tail = '' if tree.tail: tail = _whitespace.sub(' ', tree.tail.strip()) if tail: - if (guess_punct_space - and (not add_space(tail, prev[0]) or newline)): + if guess_punct_space and not add_space(tail, prev[0]): space = '' - if tail: yield [newline, space, tail] prev[0] = tree.tail @@ -108,8 +122,6 @@ def traverse_text_fragments(tree, prev): text.extend(fragment) return ''.join(text).strip() - - def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 455ebff..96ce01d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -62,11 +62,11 @@ def test_guess_page_layout(): ('title text_1. 
text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9 ...text_10')) assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == - ('title\n text_1. text_2 text_3\n text_4\n text_5' - '\n text_6 text_7 text_8\n text_9 ...text_10')) + ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) assert (extract_text(html, guess_punct_space=True) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == - ('title\ntext_1. text_2 text_3\ntext_4\ntext_5' - '\ntext_6 text_7 text_8\ntext_9...text_10')) + ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 3069a7339ac29731e32ce5f26ca31351cf6cf41b Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 6 Sep 2018 07:58:16 -0700 Subject: [PATCH 13/40] by default tail of root node will not be extracted --- html_text/__init__.py | 2 +- html_text/html_text.py | 35 ++++++++++++++++++++++++++++++----- tests/test_html_text.py | 10 +++++++++- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index 9c9d86a..661a8a1 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, html_to_text +from .html_text import extract_text, parse_html, html_to_text, cleaned_selector, selector_to_text diff --git a/html_text/html_text.py b/html_text/html_text.py index cb09a9a..6053480 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -4,6 +4,7 @@ import lxml import lxml.etree from lxml.html.clean import Cleaner +import parsel NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] @@ -79,7 +80,7 @@ def add_newline(tag, prev): return '\n' return '' - def traverse_text_fragments(tree, prev): + def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' if tree.text: @@ -97,7 +98,7 @@ def traverse_text_fragments(tree, prev): newline = '' for child in tree: - for t in traverse_text_fragments(child, prev): + for t in traverse_text_fragments(child, prev, depth+1): yield t if guess_page_layout: @@ -106,7 +107,7 @@ def traverse_text_fragments(tree, prev): prev[0] = newline tail = '' - if tree.tail: + if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) if tail: if guess_punct_space and not add_space(tail, prev[0]): @@ -118,10 +119,34 @@ def traverse_text_fragments(tree, prev): yield [newline] text = [] - for fragment in traverse_text_fragments(tree, [None]): + for fragment in traverse_text_fragments(tree, [None], 0): text.extend(fragment) return ''.join(text).strip() + +def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. + """ + return html_to_text(sel.root, guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) + + +def cleaned_selector(html): + """ Clean selector. 
+ """ + try: + tree = _cleaned_html_tree(html) + sel = parsel.Selector(root=tree, type='html') + except (lxml.etree.XMLSyntaxError, + lxml.etree.ParseError, + lxml.etree.ParserError, + UnicodeEncodeError): + # likely plain text + sel = parsel.Selector(html) + return sel + + def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. @@ -138,4 +163,4 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) + return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout,) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 96ce01d..ff81fa7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- import pytest -from html_text import extract_text, html_to_text, parse_html +from html_text import (extract_text, html_to_text, parse_html, parse_html, + cleaned_selector, selector_to_text) @pytest.fixture(params=[{'guess_punct_space': True}, @@ -52,6 +53,13 @@ def test_punct_whitespace_preserved(): assert (extract_text(html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)') +def test_selector(all_options): + html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' + sel = cleaned_selector(html) + assert selector_to_text(sel, **all_options) == 'text more and more text' + subsel = sel.xpath('//div[@id="extract-me"]')[0] + assert selector_to_text(subsel, **all_options) == 'text more' + def test_guess_page_layout(): html = (u' title
text_1.<p>text_2 text_3</p>'
    ' From dd032013843361d5d3e9eedad231ff2116c7e4e9 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 6 Sep 2018 12:18:29 -0700 Subject: [PATCH 14/40] add test --- tests/test_html_text.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index ff81fa7..6f180f7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import pytest +import lxml from html_text import (extract_text, html_to_text, parse_html, parse_html, cleaned_selector, selector_to_text) @@ -61,20 +62,34 @@ def test_selector(all_options): assert selector_to_text(subsel, **all_options) == 'text more' -def test_guess_page_layout(): +def test_html_to_text(): html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>
    text_6text_7' - 'text_8

    text_9

    ...text_10

    ' - ) - assert (extract_text(html, guess_punct_space=False) == + 'text_8

    text_9

...text_10

') + + parser = lxml.html.HTMLParser(encoding='utf8') + tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) + + assert (html_to_text(tree, guess_punct_space=False) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9 ...text_10')) - assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == + assert (html_to_text(tree, guess_punct_space=False, guess_page_layout=True) == ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (extract_text(html, guess_punct_space=True) == + assert (html_to_text(tree, guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9...text_10')) + assert (html_to_text(tree, guess_punct_space=True, guess_page_layout=True) == + ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + +def test_guess_page_layout(): + html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>
' + '

text_6text_7text_8

text_9
' + '

...text_10

' + ) assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 0f2fb2b840d71538cf874027a8026167fd96d2e6 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 12:45:17 -0700 Subject: [PATCH 15/40] fix indentation --- tests/test_html_text.py | 72 +++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 6f180f7..9a64aea 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,16 +2,21 @@ import pytest import lxml -from html_text import (extract_text, html_to_text, parse_html, parse_html, +from html_text import (extract_text, html_to_text, parse_html, cleaned_selector, selector_to_text) -@pytest.fixture(params=[{'guess_punct_space': True}, - {'guess_punct_space': False}, - {'guess_punct_space': True, 'guess_page_layout': True}, - {'guess_punct_space': False, 'guess_page_layout': True} - ]) - +@pytest.fixture(params=[{ + 'guess_punct_space': True +}, { + 'guess_punct_space': False +}, { + 'guess_punct_space': True, + 'guess_page_layout': True +}, { + 'guess_punct_space': False, + 'guess_page_layout': True +}]) def all_options(request): return request.param @@ -51,8 +56,9 @@ def test_punct_whitespace(): def test_punct_whitespace_preserved(): html = (u'
поле, and , ' u'more !now
a (boo)') - assert (extract_text(html, guess_punct_space=True) == - u'по ле, and , more ! now a (boo)') + assert (extract_text( + html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)') + def test_selector(all_options): html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' @@ -64,32 +70,36 @@ def test_selector(all_options): def test_html_to_text(): html = (u' title
text_1.

text_2 text_3

    ' - '
  • text_4
  • text_5

text_6text_7' - 'text_8

text_9

...text_10

') + '
  • text_4
  • text_5
  • text_6text_7' + 'text_8

    text_9

    ...text_10

    ') parser = lxml.html.HTMLParser(encoding='utf8') tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) - assert (html_to_text(tree, guess_punct_space=False) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9 ...text_10')) - assert (html_to_text(tree, guess_punct_space=False, guess_page_layout=True) == - ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' - '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (html_to_text(tree, guess_punct_space=True) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) - assert (html_to_text(tree, guess_punct_space=True, guess_page_layout=True) == - ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + assert (html_to_text(tree, guess_punct_space=False) == ( + 'title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) + assert (html_to_text( + tree, guess_punct_space=False, guess_page_layout=True) == ( + 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) + assert (html_to_text( + tree, + guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (html_to_text( + tree, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + def test_guess_page_layout(): html = (u' title
    text_1.

    text_2 text_3

    ' - '

    • text_4
    • text_5
    ' - '

    text_6text_7text_8

    text_9
    ' - '

    ...text_10

    ' - ) - assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == - ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + '

    • text_4
    • text_5
    ' + '

    text_6text_7text_8

    text_9' + '

    ...text_10

    ') + assert (extract_text( + html, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From e8da507281e06487fc635e648a34403a11f858f6 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 12:46:55 -0700 Subject: [PATCH 16/40] newline tags as set and extendable, add new features comments, delete new argument --- html_text/html_text.py | 43 +++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 6053480..9805dcd 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,9 +6,9 @@ from lxml.html.clean import Cleaner import parsel - -NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] -DOUBLE_NEWLINE_TAGS = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +NEWLINE_TAGS = frozenset(['li', 'dd', 'dt', 'dl', 'ul', 'ol']) +DOUBLE_NEWLINE_TAGS = frozenset( + ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']) _clean_html = Cleaner( scripts=True, @@ -98,7 +98,7 @@ def traverse_text_fragments(tree, prev, depth): newline = '' for child in tree: - for t in traverse_text_fragments(child, prev, depth+1): + for t in traverse_text_fragments(child, prev, depth + 1): yield t if guess_page_layout: @@ -126,10 +126,13 @@ def traverse_text_fragments(tree, prev, depth): def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. + See html_text.extract_text docstring for description of the approach + and options. """ - return html_to_text(sel.root, guess_punct_space=guess_punct_space, - guess_page_layout=guess_page_layout) + return html_to_text( + sel.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) def cleaned_selector(html): @@ -138,16 +141,18 @@ def cleaned_selector(html): try: tree = _cleaned_html_tree(html) sel = parsel.Selector(root=tree, type='html') - except (lxml.etree.XMLSyntaxError, - lxml.etree.ParseError, - lxml.etree.ParserError, - UnicodeEncodeError): + except (lxml.etree.XMLSyntaxError, lxml.etree.ParseError, + lxml.etree.ParserError, UnicodeEncodeError): # likely plain text sel = parsel.Selector(html) return sel -def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): +def extract_text(html, + guess_punct_space=True, + guess_page_layout=False, + newline_tags=NEWLINE_TAGS, + double_newline_tags=DOUBLE_NEWLINE_TAGS): """ Convert html to text, cleaning invisible content such as styles. Almost the same as normalize-space xpath, but this also @@ -158,9 +163,21 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True for punctuation. This has a slight (around 10%) performance overhead and is just a heuristic. + When guess_page_layout is True (default is False), a newline is added after + NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic + makes the extracted text more similar to how it looks like in the browser. + + NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized. + html should be a unicode string or an already parsed lxml.html element. 
""" if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout,) + return html_to_text( + cleaned, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout, + newline_tags=newline_tags, + double_newline_tags=double_newline_tags, + ) From 0b9d1398639f836c9127b2310180850083312892 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 17:22:18 -0700 Subject: [PATCH 17/40] make html_to_text private, fix its signature --- html_text/__init__.py | 3 ++- html_text/html_text.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index 661a8a1..61ef192 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,4 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, html_to_text, cleaned_selector, selector_to_text +from .html_text import (extract_text, parse_html, cleaned_selector, + selector_to_text) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9805dcd..cbb32e1 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -50,7 +50,11 @@ def parse_html(html): _has_open_bracket_before = re.compile(r'\($').search -def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): +def _html_to_text(tree, + guess_punct_space=True, + guess_page_layout=False, + newline_tags=NEWLINE_TAGS, + double_newline_tags=DOUBLE_NEWLINE_TAGS): """ Convert a cleaned html tree to text. See html_text.extract_text docstring for description of the approach @@ -70,11 +74,11 @@ def add_space(text, prev): def add_newline(tag, prev): if prev is None or prev == '\n\n': return '' - if tag in DOUBLE_NEWLINE_TAGS: + if tag in double_newline_tags: if prev == '\n': return '\n' return '\n\n' - if tag in NEWLINE_TAGS: + if tag in newline_tags: if prev == '\n': return '' return '\n' @@ -129,7 +133,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): See html_text.extract_text docstring for description of the approach and options. 
""" - return html_to_text( + return _html_to_text( sel.root, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) @@ -174,7 +178,7 @@ def extract_text(html, if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text( + return _html_to_text( cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout, From ba7cdc0f39cb1c1483d0afd0b8183b8da6d8484b Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 17:59:01 -0700 Subject: [PATCH 18/40] add new tags to handle --- html_text/html_text.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index cbb32e1..8b56252 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,9 +6,14 @@ from lxml.html.clean import Cleaner import parsel -NEWLINE_TAGS = frozenset(['li', 'dd', 'dt', 'dl', 'ul', 'ol']) -DOUBLE_NEWLINE_TAGS = frozenset( - ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']) +NEWLINE_TAGS = frozenset([ + 'br', 'article', 'aside', 'details', 'div', 'dd', 'dt', 'fieldset', + 'figcaption', 'form', 'hr', 'li', 'main', 'nav', 'table', 'tr' +]) +DOUBLE_NEWLINE_TAGS = frozenset([ + 'blockquote', 'dl', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', + 'ol', 'ul', 'p', 'pre', 'title', 'figure' +]) _clean_html = Cleaner( scripts=True, From 952d8957030b5c5413f4eba7b1ba5743f9b5a4a1 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 16:45:21 -0700 Subject: [PATCH 19/40] handle more tags --- html_text/html_text.py | 42 ++++++++++++++++++---------- tests/test_html_text.py | 62 ++++++++++++++++------------------------- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 8b56252..6dd7e36 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -7,12 +7,19 @@ import parsel NEWLINE_TAGS = frozenset([ - 'br', 'article', 'aside', 'details', 'div', 'dd', 'dt', 'fieldset', - 'figcaption', 'form', 'hr', 'li', 'main', 'nav', 'table', 'tr' + 'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset', + 'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main', + 'nav', 'table', 'tr' ]) DOUBLE_NEWLINE_TAGS = frozenset([ - 'blockquote', 'dl', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', - 'ol', 'ul', 'p', 'pre', 'title', 'figure' + 'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', + 'p', 'pre', 'title', 'ul' +]) +INLINE_TEXT_TAGS = frozenset([ + 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'cite', 'code', 'data', 'del', 'dfn', + 'em', 'i', 'ins', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', + 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'tt', 'u', 'var', + 'wbr', ]) _clean_html = Cleaner( @@ -30,6 +37,7 @@ annoying_tags=False, remove_unknown_tags=False, safe_attrs_only=False, + remove_tags=INLINE_TEXT_TAGS, # helps newline placement if guess_page_layout=True ).clean_html @@ -92,19 +100,23 @@ def add_newline(tag, prev): def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' + text = '' + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline if tree.text: text = _whitespace.sub(' ', tree.text.strip()) - if text: - if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline - if guess_punct_space and not add_space(text, prev[0]): - space = '' - yield [newline, space, text] - prev[0] = tree.text - space = ' ' - 
newline = '' + if text and guess_punct_space and not add_space(text, prev[0]): + space = '' + if text: + yield [newline, space, text] + prev[0] = tree.text + space = ' ' + newline = '' + elif newline: + yield [newline] + newline = '' for child in tree: for t in traverse_text_fragments(child, prev, depth + 1): diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 9a64aea..995d673 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,8 +2,8 @@ import pytest import lxml -from html_text import (extract_text, html_to_text, parse_html, - cleaned_selector, selector_to_text) +from html_text import (extract_text, parse_html, cleaned_selector, + selector_to_text) @pytest.fixture(params=[{ @@ -67,39 +67,25 @@ def test_selector(all_options): subsel = sel.xpath('//div[@id="extract-me"]')[0] assert selector_to_text(subsel, **all_options) == 'text more' - -def test_html_to_text(): - html = (u' title
text_1.<p>text_2 text_3</p>'
-            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
-            'text_8</p><div>text_9</div><p>...text_10</p>
    ') - - parser = lxml.html.HTMLParser(encoding='utf8') - tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) - - assert (html_to_text(tree, guess_punct_space=False) == ( - 'title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9 ...text_10')) - assert (html_to_text( - tree, guess_punct_space=False, guess_page_layout=True) == ( - 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' - '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (html_to_text( - tree, - guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) - assert (html_to_text( - tree, guess_punct_space=True, guess_page_layout=True) == ( - 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) - - -def test_guess_page_layout(): - html = (u' title
    text_1.

    text_2 text_3

    ' - '

    • text_4
    • text_5
    ' - '

    text_6text_7text_8

    text_9
    ' - '

    ...text_10

    ') - assert (extract_text( - html, guess_punct_space=True, guess_page_layout=True) == ( - 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) +# +# def test_guess_page_layout(): +# html = (u' title
    text_1.

    text_2 text_3

    ' +# '

    • text_4
    • text_5
    ' +# '

    text_6text_7text_8

    text_9
    ' +# '

    ...text_10

    ') +# assert (extract_text(html, guess_punct_space=False) == ( +# 'title text_1. text_2 text_3 text_4 text_5' +# ' text_6 text_7 text_8 text_9 ...text_10')) +# assert (extract_text( +# html, guess_punct_space=False, guess_page_layout=True) == ( +# 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' +# '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) +# assert (extract_text( +# html, +# guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' +# ' text_6 text_7 text_8 text_9...text_10')) +# assert (extract_text( +# html, guess_punct_space=True, guess_page_layout=True) == ( +# 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' +# '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 9dafbf026c8f0e1f2a3b91d88f9dd10c1c5d6a1f Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:22:30 -0700 Subject: [PATCH 20/40] remove cleaning of inline tags --- html_text/html_text.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 6dd7e36..c1dc717 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -15,12 +15,6 @@ 'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'p', 'pre', 'title', 'ul' ]) -INLINE_TEXT_TAGS = frozenset([ - 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'cite', 'code', 'data', 'del', 'dfn', - 'em', 'i', 'ins', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', - 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'tt', 'u', 'var', - 'wbr', -]) _clean_html = Cleaner( scripts=True, @@ -37,7 +31,6 @@ annoying_tags=False, remove_unknown_tags=False, safe_attrs_only=False, - remove_tags=INLINE_TEXT_TAGS, # helps newline placement if guess_page_layout=True ).clean_html From b3229d6add78590b0c25a44663ce04c582c8f5ba Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:41:16 -0700 Subject: [PATCH 21/40] fix bug with multiple newlines --- html_text/html_text.py | 22 ++++++++---------- tests/test_html_text.py | 51 ++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index c1dc717..9530f9b 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -79,25 +79,23 @@ def add_space(text, prev): def add_newline(tag, prev): if prev is None or prev == '\n\n': - return '' + return '', '\n\n' if tag in double_newline_tags: if prev == '\n': - return '\n' - return '\n\n' + return '\n', '\n\n' + return '\n\n', '\n\n' if tag in newline_tags: if prev == '\n': - return '' - return '\n' - return '' + return '', '\n' + return '\n', '\n' + return '', '' def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' text = '' if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline + newline, prev[0] = add_newline(tree.tag, prev[0]) if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text and guess_punct_space and not add_space(text, prev[0]): @@ -116,10 +114,8 @@ def traverse_text_fragments(tree, prev, depth): yield t if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline - + newline, prev[0] = add_newline(tree.tag, prev[0]) + tail = '' if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 995d673..584e1fc 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -61,31 +61,34 @@ def test_punct_whitespace_preserved(): def 
test_selector(all_options): - html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
    ' + html = ( + u'textmoreand more text' + ) sel = cleaned_selector(html) assert selector_to_text(sel, **all_options) == 'text more and more text' - subsel = sel.xpath('//div[@id="extract-me"]')[0] + subsel = sel.xpath('//span[@id="extract-me"]')[0] assert selector_to_text(subsel, **all_options) == 'text more' -# -# def test_guess_page_layout(): -# html = (u' title
    text_1.

    text_2 text_3

    ' -# '

    • text_4
    • text_5
    ' -# '

    text_6text_7text_8

    text_9
    ' -# '

    ...text_10

    ') -# assert (extract_text(html, guess_punct_space=False) == ( -# 'title text_1. text_2 text_3 text_4 text_5' -# ' text_6 text_7 text_8 text_9 ...text_10')) -# assert (extract_text( -# html, guess_punct_space=False, guess_page_layout=True) == ( -# 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' -# '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) -# assert (extract_text( -# html, -# guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' -# ' text_6 text_7 text_8 text_9...text_10')) -# assert (extract_text( -# html, guess_punct_space=True, guess_page_layout=True) == ( -# 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' -# '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + + +def test_guess_page_layout(): + html = (u' title
    text_1.

    text_2 text_3

    ' + '

    • text_4
    • text_5
    ' + '

    text_6text_7text_8

    text_9
    ' + '

    ...text_10

    ') + assert (extract_text(html, guess_punct_space=False) == ( + 'title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) + assert (extract_text( + html, guess_punct_space=False, guess_page_layout=True) == ( + 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) + assert (extract_text( + html, + guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (extract_text( + html, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 695b458fa185929fc99611ffedeaeee648121821 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:42:11 -0700 Subject: [PATCH 22/40] remove newline --- tests/test_html_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 584e1fc..4a9ef42 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -70,7 +70,6 @@ def test_selector(all_options): assert selector_to_text(subsel, **all_options) == 'text more' - def test_guess_page_layout(): html = (u' title
    text_1.

    text_2 text_3

    ' '

    • text_4
    • text_5
    ' From 03259b9f398898ddf92fd2ff0e297ff3a93349ce Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:47:50 -0700 Subject: [PATCH 23/40] add test html without text --- tests/test_html_text.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 4a9ef42..8e93ac5 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -21,6 +21,13 @@ def all_options(request): return request.param +def test_extract_no_text_html(all_options): + html = (u'
<!DOCTYPE html><html><body><p><video width="320" height="240" '
+            'controls><source src="movie.mp4" type="video/mp4"><source '
+            'src="movie.ogg" type="video/ogg"></video></p></body></html>')
+    assert extract_text(html, **all_options) == u''
+
+
 def test_extract_text(all_options):
     html = u'

    Hello, world!' assert extract_text(html, **all_options) == u'Hello, world!' From cba531fe2330c9c14d82ef9ba8a717876c588759 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:26:58 -0700 Subject: [PATCH 24/40] fix newline + space bug --- html_text/html_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9530f9b..9c9a20a 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -86,9 +86,9 @@ def add_newline(tag, prev): return '\n\n', '\n\n' if tag in newline_tags: if prev == '\n': - return '', '\n' + return '', prev return '\n', '\n' - return '', '' + return '', prev def traverse_text_fragments(tree, prev, depth): space = ' ' From 9811349bd06cf52e3b573a224e83bc60485eeaa2 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:27:35 -0700 Subject: [PATCH 25/40] add bad punct test --- tests/test_html_text.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 8e93ac5..83845e4 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -58,6 +58,7 @@ def test_inline_tags_whitespace(all_options): def test_punct_whitespace(): html = u'
<div><b>field</b>, and more</div>'
     assert extract_text(html, guess_punct_space=False) == u'field , and more'
+    assert extract_text(html, guess_punct_space=True) == u'field, and more'
 
 
 def test_punct_whitespace_preserved():
@@ -67,6 +68,19 @@ def test_punct_whitespace_preserved():
         html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)')
 
 
+def test_bad_punct_whitespace():
+    html = (u'
    trees '
    +            '= webstruct'
    +            '.load_trees'
    +            '("train/*.html"'
    +            ')
    ') + assert extract_text( + html, guess_punct_space=False) == ( + u'trees = webstruct . load_trees ( "train/*.html" )') + assert extract_text( + html, guess_punct_space=True) == ( + u'trees = webstruct. load_trees ("train/*.html")') + def test_selector(all_options): html = ( u'textmoreand more text' From d47138cb09013e806df27179ca8e469a8c350501 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:27:53 -0700 Subject: [PATCH 26/40] add newline --- tests/test_html_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 83845e4..f98c58d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -81,6 +81,7 @@ def test_bad_punct_whitespace(): html, guess_punct_space=True) == ( u'trees = webstruct. load_trees ("train/*.html")') + def test_selector(all_options): html = ( u'textmoreand more text' From 76f9028c5e1c0b1e22561bce5b383e3f906d83be Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 19:06:51 -0700 Subject: [PATCH 27/40] add tests on real webpages --- tests/test_html_text.py | 14 + ...the Attic | Books to Scrape - Sandbox.html | 361 +++++++++++ ... the Attic | Books to Scrape - Sandbox.txt | 30 + ...00\224 IANA-managed Reserved Domains.html" | 233 +++++++ ...200\224 IANA-managed Reserved Domains.txt" | 105 ++++ .../Scrapinghub Enterprise Solutions.html | 3 + .../Scrapinghub Enterprise Solutions.txt | 230 +++++++ ...\200\224 Webstruct 0.6 documentation.html" | 590 ++++++++++++++++++ ...2\200\224 Webstruct 0.6 documentation.txt" | 214 +++++++ ...\200\224 Webstruct 0.6 documentation.html" | 357 +++++++++++ ...2\200\224 Webstruct 0.6 documentation.txt" | 91 +++ 11 files changed, 2228 insertions(+) create mode 100644 tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html create mode 100644 tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt create mode 100644 "tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" create mode 100644 "tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" create mode 100644 tests/test_webpages/Scrapinghub Enterprise Solutions.html create mode 100644 tests/test_webpages/Scrapinghub Enterprise Solutions.txt create mode 100644 "tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" create mode 100644 "tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" create mode 100644 "tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" create mode 100644 "tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" diff --git a/tests/test_html_text.py b/tests/test_html_text.py index f98c58d..612668d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import pytest import lxml +import glob +from pathlib import Path from html_text import (extract_text, parse_html, cleaned_selector, selector_to_text) @@ -113,3 +115,15 @@ def test_guess_page_layout(): html, guess_punct_space=True, guess_page_layout=True) == ( 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + + +def test_webpages(): + webpages = sorted(glob.glob('./test_webpages/*.html')) + extracted = sorted(glob.glob('./test_webpages/*.txt')) + for page, extr in zip(webpages, extracted): + with open(page, 'r', encoding='utf8') as f_in: + html = f_in.read() + with open(extr, 'r', encoding='utf8') as f_in: + expected = f_in.read() + assert 
(extract_text( + html, guess_punct_space=True, guess_page_layout=True) == expected) diff --git a/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html new file mode 100644 index 0000000..2c2d627 --- /dev/null +++ b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html @@ -0,0 +1,361 @@ + + + + + + + + + + A Light in the Attic | Books to Scrape - Sandbox + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    Books to Scrape We love being scraped! +
    + + +
    +
    +
    + + + +
    +
    + + + + + + + + + +
    + +
    + + +
    + + + +
    + +
    + + +
    + +
    + +
    + + +
    + + + + + + + + + + + + + + + +
    + + + +
    + + +

    A Light in the Attic

    + + + + + + + + + + +

    £51.77

    + + +

    + + + In stock (22 available) + +

    + + + + + + + + +

    + + + + + + +   + + +

    + + + +
    + + + + + + + + + + + + + +
    + + +
    + + + +
    +

    Product Description

    +
    +

    It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more

    + + + + +
    +

    Product Information

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    UPCa897fe39b1053632
    Product TypeBooks
    Price (excl. tax)£51.77
    Price (incl. tax)£51.77
    Tax£0.00
    AvailabilityIn stock (22 available)
    Number of reviews0
    + + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + + +
    + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt new file mode 100644 index 0000000..4c664a8 --- /dev/null +++ b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt @@ -0,0 +1,30 @@ +A Light in the Attic | Books to Scrape - Sandbox + +Books to Scrape We love being scraped! + +Home +Books +Poetry +A Light in the Attic + +A Light in the Attic + +£51.77 + +In stock (22 available) + +Warning! This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning. + +Product Description + +It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more + +Product Information + +UPC a897fe39b1053632 +Product Type Books +Price (excl. tax) £51.77 +Price (incl. tax) £51.77 +Tax £0.00 +Availability In stock (22 available) +Number of reviews 0 \ No newline at end of file diff --git "a/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" new file mode 100644 index 0000000..ccf988f --- /dev/null +++ "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" @@ -0,0 +1,233 @@ + + + + IANA — IANA-managed Reserved Domains + + + + + + + + + + + + + + + + + + +
    + +
    + +
    + + +
    + + +

    IANA-managed Reserved Domains

    + +

    Certain domains are set aside, and nominally registered to “IANA”, for specific + policy or technical purposes.

    + +

    Example domains

    + +

    As described in RFC 2606 and RFC 6761, + a number of domains such as example.com and example.org + are maintained for documentation purposes. These domains may be used as illustrative + examples in documents without prior coordination with us. They are + not available for registration or transfer.

    + +

    Test IDN top-level domains

    + +

    These domains were temporarily delegated by IANA for the IDN Evaluation being conducted by ICANN.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    DomainDomain (A-label)LanguageScript
    إختبارXN--KGBECHTVArabicArabic
    آزمایشیXN--HGBK6AJ7F53BBAPersianArabic
    测试XN--0ZWM56DChineseHan (Simplified variant)
    測試XN--G6W251DChineseHan (Traditional variant)
    испытаниеXN--80AKHBYKNJ4FRussianCyrillic
    परीक्षाXN--11B5BS3A9AJ6GHindiDevanagari (Nagari)
    δοκιμήXN--JXALPDLPGreek, Modern (1453-)Greek
    테스트XN--9T4B11YI5AKoreanHangul (Hangŭl, Hangeul)
    טעסטXN--DEBA0ADYiddishHebrew
    テストXN--ZCKZAHJapaneseKatakana
    பரிட்சைXN--HLCJ6AYA9ESC7ATamilTamil
    +
    + +

    Policy-reserved domains

    + +

    We act as both the registrant and registrar for a select number of domains + which have been reserved under policy grounds. These exclusions are + typically indicated in either technical standards (RFC documents), + or contractual limitations.

    + +

    Domains which are described as registered to IANA or ICANN on policy + grounds are not available for registration or transfer, with the exception + of country-name.info domains. These domains are available for release + by the ICANN Governmental Advisory Committee Secretariat.

    + +

    Other Special-Use Domains

    + +

    There is additionally a Special-Use Domain Names registry documenting special-use domains designated by technical standards. For further information, see Special-Use Domain Names (RFC 6761).

    + + +
    + + + + +
    + + + + + + + + + + diff --git "a/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" new file mode 100644 index 0000000..1aab856 --- /dev/null +++ "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" @@ -0,0 +1,105 @@ +IANA — IANA-managed Reserved Domains + +Domains +Numbers +Protocols +About Us + +IANA-managed Reserved Domains + +Certain domains are set aside, and nominally registered to “IANA”, for specific policy or technical purposes. + +Example domains + +As described in RFC 2606 and RFC 6761, a number of domains such as example.com and example.org are maintained for documentation purposes. These domains may be used as illustrative examples in documents without prior coordination with us. They are not available for registration or transfer. + +Test IDN top-level domains + +These domains were temporarily delegated by IANA for the IDN Evaluation being conducted by ICANN. + +Domain Domain (A-label) Language Script +إختبار XN--KGBECHTV Arabic Arabic +آزمایشی XN--HGBK6AJ7F53BBA Persian Arabic +测试 XN--0ZWM56D Chinese Han (Simplified variant) +測試 XN--G6W251D Chinese Han (Traditional variant) +испытание XN--80AKHBYKNJ4F Russian Cyrillic +परीक्षा XN--11B5BS3A9AJ6G Hindi Devanagari (Nagari) +δοκιμή XN--JXALPDLP Greek, Modern (1453-) Greek +테스트 XN--9T4B11YI5A Korean Hangul (Hangŭl, Hangeul) +טעסט XN--DEBA0AD Yiddish Hebrew +テスト XN--ZCKZAH Japanese Katakana +பரிட்சை XN--HLCJ6AYA9ESC7A Tamil Tamil + +Policy-reserved domains + +We act as both the registrant and registrar for a select number of domains which have been reserved under policy grounds. These exclusions are typically indicated in either technical standards (RFC documents), or contractual limitations. + +Domains which are described as registered to IANA or ICANN on policy grounds are not available for registration or transfer, with the exception of country-name.info domains. These domains are available for release by the ICANN Governmental Advisory Committee Secretariat. + +Other Special-Use Domains + +There is additionally a Special-Use Domain Names registry documenting special-use domains designated by technical standards. For further information, see Special-Use Domain Names (RFC 6761). + +Domain Names + +Overview +Root Zone Management + +Overview +Root Database +Hint and Zone Files +Change Requests +Instructions & Guides +Root Servers + +.INT Registry + +Overview +Register/modify an .INT domain +Eligibility + +.ARPA Registry +IDN Practices Repository + +Overview +Submit a table + +Root Key Signing Key (DNSSEC) + +Overview +Trusts Anchors and Keys +Root KSK Ceremonies +Practice Statement +Community Representatives + +Reserved Domains + +Domain Names + +Root Zone Registry +.INT Registry +.ARPA Registry +IDN Repository + +Number Resources + +Abuse Information + +Protocols + +Protocol Registries +Time Zone Database + +About Us + +Presentations +Reports +Performance +Reviews +Excellence +Contact Us + +The IANA functions coordinate the Internet’s globally unique identifiers, and are provided by Public Technical Identifiers, an affiliate of ICANN. + +Privacy Policy +Terms of Service \ No newline at end of file diff --git a/tests/test_webpages/Scrapinghub Enterprise Solutions.html b/tests/test_webpages/Scrapinghub Enterprise Solutions.html new file mode 100644 index 0000000..c9b02f2 --- /dev/null +++ b/tests/test_webpages/Scrapinghub Enterprise Solutions.html @@ -0,0 +1,3 @@ + Scrapinghub Enterprise Solutions

    Web data, hassle-free, for real business needs

    Get a free consultation

    From the world leading experts in web scraping

    Lead generation, competitor & sales intelligence

    Alternative data for finance, equity and market research

    Dark web, law enforcement & compliance

    Staffing, talent sourcing & job market research

    Product aggregation & price monitoring for retail, e-commerce & manufacturers

    Monitoring of ratings and reviews, sentiment analysis & social network intelligence

    Need some advice?

    The best web crawler team

    Authors of the #1 web crawling framework, the world’s most experienced team of engineers will help you get the very best results for your project.

    7 billion pages crawled on our platform
    per month

    We are the authors of the most popular open-source web scraping tools. You can be assured that our services are the best in class.

    100% money-back guarantee

    All your scraping projects are backed by us. Maintenance agreements and enterprise SLAs available to ensure long-term success.

    We scrape the web for:

    \ No newline at end of file diff --git a/tests/test_webpages/Scrapinghub Enterprise Solutions.txt b/tests/test_webpages/Scrapinghub Enterprise Solutions.txt new file mode 100644 index 0000000..d101b43 --- /dev/null +++ b/tests/test_webpages/Scrapinghub Enterprise Solutions.txt @@ -0,0 +1,230 @@ +Scrapinghub Enterprise Solutions + +Enterprise Solutions +Products + +Data on Demand + +Turn web content into useful data for your business + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Professional Services + +The most experienced team from the market leaders in web scraping + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Scrapy Training + +Get your team trained on Scrapy, by the team that create Scrapy itself + +Developer Tools + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Splash + +A full blown browser behind an API, to render pages and execute actions + +Pricing + +Data on Demand + +Crawlera + +Scrapy Cloud + +Splash + +Sign In + +hamburger + +Enterprise Solutions +Products + +Data on Demand + +Turn web content into useful data for your business + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Professional Services + +The most experienced team from the market leaders in web scraping + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Scrapy Training + +Get your team trained on Scrapy, by the team that create Scrapy itself + +Developer Tools + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Splash + +A full blown browser behind an API, to render pages and execute actions + +Pricing + +Data on Demand + +Crawlera + +Scrapy Cloud + +Splash + +Sign In + +Enterprise Solutions + +Complete web scraping services for any size business, from startups to Fortune 100’s + +Tell us about your project + +Web data, hassle-free, for real business needs + +Get a free consultation + +From the world leading experts in web scraping + +Lead generation, competitor & sales intelligence + +Alternative data for finance, equity and market research + +Dark web, law enforcement & compliance + +Staffing, talent sourcing & job market research + +Product aggregation & price monitoring for retail, e-commerce & manufacturers + +Monitoring of ratings and reviews, sentiment analysis & social network intelligence + +Let’s Partner + +Team up with the best web scraping engineers while you stay focused on your business goals + +Quality assurance, enterprise service-level agreements and maintenance plans + +Full access to your project’s code with training and handover + +Money-back guarantee for your project + +Get in touch + +Data on Demand + +Any size scraping project. 
Data refreshed regularly, reliably and in the form you want + +Accuracy and coverage guarantees + +Scraped data from virtually any number of web pages + +Post processing and automated data crawling updates anytime + +Get in touch + +Data Science + +Enriched data for your business that goes beyond traditional web crawling needs + +Your raw web data post-processed for real insights + +Link data across disparate scraped pages + +Deduce sentiment on a large scale + +Get in touch + +Training + +Learn from the recognised experts in data crawling and scraping to grow your own in-house team + +One-to-one and group training + +Standard introduction to web scraping + +Tailored courses to help you solve very specific business challenges + +Get in touch + +Need some advice? + +The best web crawler team + +Authors of the #1 web crawling framework, the world’s most experienced team of engineers will help you get the very best results for your project. + +7 billion pages crawled on our platform +per month + +We are the authors of the most popular open-source web scraping tools. You can be assured that our services are the best in class. + +100% money-back guarantee + +All your scraping projects are backed by us. Maintenance agreements and enterprise SLAs available to ensure long-term success. + +Ask any question + +We scrape the web for: + +Need web data? + +Contact us + +scrapinghub-letter-logo + +Cuil Greine House + +Ballincollig Commercial Park, Link Road + +Ballincollig, Co. Cork, Ireland + +VAT Number IE 9787078K + +Follow us + +Company + +About us Clients Open Source Contact Jobs Press + +Products + +Data on Demand Proxy Network Professional Services Scrapy Training + +Developers + +Scrapy Cloud Crawlera Splash + +Resources + +Webinars Blog Documentation Support & KB Status Terms of Service Abuse Report Privacy Policy Cookie Policy + +© 2010-2017 Scrapinghub + +Scrapinghub uses cookies to enhance your experience, analyze our website traffic, and share information with our analytics partners. By using this website you consent to our use of cookies. For more information, please refer to our Cookie Policy. + +I Agree \ No newline at end of file diff --git "a/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" new file mode 100644 index 0000000..a174bca --- /dev/null +++ "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" @@ -0,0 +1,590 @@ + + + + + + + + + + + Tutorial — Webstruct 0.6 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    +
    +
    + +
    +

    Tutorial

    +

    This tutorial assumes you are familiar with machine learning.

    +
    +

    Get annotated data

    +

    First, you need the training/development data. We suggest to use +WebAnnotator Firefox extension to annotate HTML pages.

    +

    Recommended WebAnnotator options:

    +_images/wa-options.png +

    Pro tip - enable WebAnnotator toolbar buttons:

    +_images/wa-buttons.png +

    Follow WebAnnotator manual +to define named entities and annotate some web pages (nested WebAnnotator +entities are not supported). Use “Save as..” menu item or “Save as” +toolbar button to save the results; don’t use “Export as”.

    +

    After that you can load annotated webpages as lxml trees:

    +
    import webstruct
    +trees = webstruct.load_trees("train/*.html", webstruct.WebAnnotatorLoader())
    +
    +
    +

    See HTML Loaders for more info. +GATE annotation format is also supported.

    +
    +
    +

    From HTML to Tokens

    +

    To convert HTML trees to a format suitable for sequence prediction algorithm +(like CRF, MEMM or Structured Perceptron) the following approach is used:

    +
      +
    1. Text is extracted from HTML and split into tokens.
    2. +
    3. For each token a special HtmlToken instance is created. It +contains information not only about the text token itself, but also about +its position in HTML tree.
    4. +
    +

    A single HTML page corresponds to a single input sequence +(a list of HtmlTokens). For training/testing data +(where webpages are already annotated) there is also a list of labels for +each webpage, a label per HtmlToken.

    +

    To transform HTML trees into labels and HTML tokens +use HtmlTokenizer.

    +
    html_tokenizer = webstruct.HtmlTokenizer()
    +X, y = html_tokenizer.tokenize(trees)
    +
    +
    +

    Input trees should be loaded by one of the WebStruct loaders. +For consistency, for each tree (even if it is loaded from raw unannotated html) +HtmlTokenizer extracts two arrays: a list of HtmlToken +instances and a list of tags encoded using IOB2 encoding +(also known as BIO encoding). So in our example X is a list of +lists of HtmlToken instances, and y is a list of lists +of strings.

    +
    +
    +

    Feature Extraction

    +

    For supervised machine learning algorithms to work we need to extract +features.

    +

    In WebStruct feature vectors are Python dicts +{"feature_name": "feature_value"}; a dict is computed for +each HTML token. How to convert these dicts into representation required +by a sequence labelling toolkit depends on a toolkit used; we will cover +that later.

    +

    To compute feature dicts we’ll use HtmlFeatureExtractor.

    +

    First, define your feature functions. A feature function should take +an HtmlToken instance and return a feature dict; +feature dicts from individual feature functions will be merged +into the final feature dict for a token. Feature functions can ask questions +about token itself, its neighbours (in the same HTML element), +its position in HTML.

    +
    +

    Note

    +

    WebStruct supports other kind of feature functions that work on multiple +tokens; we don’t cover them in this tutorial.

    +
    +

    There are predefined feature functions in webstruct.features, +but for this tutorial let’s create some functions ourselves:

    +
    def token_identity(html_token):
    +    return {'token': html_token.token}
    +
    +def token_isupper(html_token):
    +    return {'isupper': html_token.token.isupper()}
    +
    +def parent_tag(html_token):
    +    return {'parent_tag': html_token.parent.tag}
    +
    +def border_at_left(html_token):
    +    return {'border_at_left': html_token.index == 0}
    +
    +
    +

    Next, create HtmlFeatureExtractor:

    +
    feature_extractor = HtmlFeatureExtractor(
    +    token_features = [
    +        token_identity,
    +        token_isupper,
    +        parent_tag,
    +        border_at_left
    +    ]
    +)
    +
    +
    +

    and use it to extract feature dicts:

    +
    features = feature_extractor.fit_transform(X)
    +
    +
    +

    See Feature Extraction for more info about HTML tokenization and +feature extraction.

    +
    +
    +

    Using a Sequence Labelling Toolkit

    +

    WebStruct doesn’t provide a CRF or Structured Perceptron implementation; +learning and prediction is supposed to be handled by an external +sequence labelling toolkit like CRFSuite, Wapiti or seqlearn.

    +

    Once feature dicts are extracted from HTML you should convert them to +a format required by your sequence labelling tooklit and use this toolkit +to train a model and do the prediction. For example, you may use +DictVectorizer from scikit-learn to convert feature dicts +into seqlearn input format.

    +

    We’ll use CRFSuite in this tutorial.

    +

    WebStruct provides some helpers for CRFSuite sequence labelling toolkit. +To use CRFSuite with WebStruct, you need

    +
      +
    • sklearn-crfsuite package (which depends on python-crfsuite and sklearn)
    • +
    +
    +

    Defining a Model

    +

    Basic way to define CRF model is the following:

    +
    model = webstruct.create_crfsuite_pipeline(
    +        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
    +        verbose=True
    +    )
    +
    +
    +

    First create_crfsuite_pipeline() argument is a list of feature functions which will be used for training. +verbose is a boolean parameter enabling verbose output of various training information; +check sklearn-crfsuite API reference +for available options.

    +

    Under the hood create_crfsuite_pipeline() creates a +sklearn.pipeline.Pipeline with an HtmlFeatureExtractor instance +followed by sklearn_crfsuite.CRF instance. The example above is just a shortcut +for the following:

    +
    model = Pipeline([
    +    ('fe', HtmlFeatureExtractor(
    +        token_features = [
    +            token_identity,
    +            token_isupper,
    +            parent_tag,
    +            border_at_left,
    +        ]
    +    )),
    +    ('crf', sklearn_crfsuite.CRF(
    +        verbose=True
    +    )),
    +])
    +
    +
    +
    +
    +

    Training

    +

    To train a model use its fit method:

    +
    model.fit(X, y)
    +
    +
    +

    X and y are return values of HtmlTokenizer.tokenize() +(a list of lists of HtmlToken instances and a list of +lists of string IOB labels).

    +

    If you use sklearn_crfsuite.CRF directly then train it using +CRF.fit() method. It accepts 2 lists: a list of lists of +feature dicts, and a list of lists of tags:

    +
    model.fit(features, y)
    +
    +
    +
    +
    +
    +

    Named Entity Recognition

    +

    Once you got a trained model you can use it to extract entities +from unseen (unannotated) webpages. First, get some binary HTML data:

    +
    >>> import urllib2
    +>>> html = urllib2.urlopen("http://scrapinghub.com/contact").read()
    +
    +
    +

    Then create a NER instance initialized with a trained model:

    +
    >>> ner = webstruct.NER(model)
    +
    +
    +

    The model must provide a predict method that extracts features +from HTML tokens and predicts labels for these tokens. A pipeline created with +create_crfsuite_pipeline() function fits this definition.

    +

    Finally, use NER.extract() method to extract entities:

    +
    >>> ner.extract(html)
    +[('Scrapinghub', 'ORG'), ..., ('Iturriaga 3429 ap. 1', 'STREET'), ...]
    +
    +
    +

    Generally, the steps are:

    +
      +
    1. Load data using HtmlLoader loader. If a custom HTML cleaner +was used for loading training data make sure to apply it here as well.
    2. +
    3. Use the same html_tokenizer as used for training to extract HTML tokens +from loaded trees. All labels would be “O” when using HtmlLoader +loader - y can be discarded.
    4. +
    5. Use the same feature_extractor as used for training to extract +features.
    6. +
    7. Run your_crf.predict() method (e.g. CRF.predict()) +on features extracted in (3) to get the prediction - a list of IOB2-encoded +tags for each input document.
    8. +
    9. Build entities from input tokens based on predicted tags +(check IobEncoder.group() and smart_join()).
    10. +
    11. Split entities into groups (optional). One way to do it is to use +webstruct.grouping.
    12. +
    +

    NER helper class combines HTML loading, HTML tokenization, +feature extraction, CRF model, entity building and grouping.

    +
    +
    +

    Entity Grouping

    +

    Detecting entities on their own is not always enough; in many cases +what is wanted is to find the relationship between them. For example, +“street_name/STREET city_name/CITY zipcode_number/ZIPCODE +form an address”, or “phone/TEL is a phone of person/PER”.

    +

    The first approximation is to say that all entities from a single webpage +are related. For example, if we have extracted some organizaion/ORG and some +phone/TEL from a single webpage we may assume that the phone +is a contact phone of the organization.

    +

    Sometimes there are several “entity groups” on a webpage. If a page +contains contact phones of several persons or several business locations +it is better to split all entities into groups of related +entities - “person name + his/her phone(s)” or “address”.

    +

    WebStruct provides an unsupervised algorithm +for extracting such entity groups. Algorithm prefers to build +large groups without entities of duplicate types; if a split is needed +algorithm tries to split at points where distance between entities is larger.

    +

    Use NER.extract_groups() to extract groups of entities:

    +
    >>> ner.extract_groups(html)
    +[[...], ... [('Iturriaga 3429 ap. 1', 'STREET'), ('Montevideo', 'CITY'), ...]]
    +
    +
    +

    Sometimes it is better to allow some entity types to appear +multuple times in a group. For example, a person (PER entity) may have +several contact phones and faxes (TEL and FAX entities) - we should penalize +groups with multiple PERs, but multiple TELs and FAXes are fine. +Use dont_penalize argument if you want to allow some entity types +to appear multiple times in a group:

    +
    ner.extract_groups(html, dont_penalize={'TEL', 'FAX'})
    +
    +
    +

    The simple algorithm WebStruct provides is by no means a general solution +to relation detection, but give it a try - maybe it is enough for your task.

    +
    +
    +

    Model Development

    +

    To develop the model you need to choose the learning algorithm, +features, hyperparameters, etc. To do that you need scoring metrics, +cross-validation utilities and tools for debugging what classifier learned. +WebStruct helps in the following way:

    +
      +
    1. Pipeline created by create_crfsuite_pipeline() is compatible with +cross-validation and grid search utilities from scikit-learn; +use them to select model parameters and check the quality.

      +

      One limitation of create_crfsuite_pipeline() is that n_jobs +in scikit-learn functions and classes should be 1, but other than that +WebStruct objects should work fine with scikit-learn. Just keep in mind +that for WebStruct an “observation” is a document, not an individual token, +and a “label” is a sequence of labels for a document, not an individual +IOB tag.

      +
    2. +
    3. There is webstruct.metrics module with a couple of metrics useful +for sequence classification.

      +
    4. +
    +

    To debug what CRFSuite learned you could use eli5 library. With eli5 it would be two calls to +eli5.explain_weights() and eli5.format_as_html() with sklearn_crfsuite.CRF instance as argument. +As a result you will get transitions and feature weights.

    +
    +
    + + +
    + +
    + + +
    +
    + +
    + +
    + + +
    + + Read the Docs + v: latest + + +
    +
    +
    Versions
    + +
    latest
    + +
    stable
    + +
    0.6
    + +
    0.5
    + +
    0.4.1
    + +
    0.4
    + +
    0.3
    + +
    0.2
    + +
    +
    +
    Downloads
    + +
    pdf
    + +
    htmlzip
    + +
    epub
    + +
    +
    +
    On Read the Docs
    +
    + Project Home +
    +
    + Builds +
    +
    +
    + Free document hosting provided by Read the Docs. + +
    +
    + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" new file mode 100644 index 0000000..0431385 --- /dev/null +++ "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" @@ -0,0 +1,214 @@ +Tutorial — Webstruct 0.6 documentation + +Webstruct +latest + +Webstruct +Tutorial + +Get annotated data +From HTML to Tokens +Feature Extraction +Using a Sequence Labelling Toolkit + +Defining a Model +Training + +Named Entity Recognition +Entity Grouping +Model Development + +Reference +Changes + +Webstruct + +Docs » +Tutorial +Edit on GitHub + +Tutorial ¶ + +This tutorial assumes you are familiar with machine learning. + +Get annotated data ¶ + +First, you need the training/development data. We suggest to use WebAnnotator Firefox extension to annotate HTML pages. + +Recommended WebAnnotator options: + +Pro tip - enable WebAnnotator toolbar buttons: + +Follow WebAnnotator manual to define named entities and annotate some web pages (nested WebAnnotator entities are not supported). Use “Save as..” menu item or “Save as” toolbar button to save the results; don’t use “Export as”. + +After that you can load annotated webpages as lxml trees: + +import webstruct trees = webstruct. load_trees ("train/*.html", webstruct. WebAnnotatorLoader ()) + +See HTML Loaders for more info. GATE annotation format is also supported. + +From HTML to Tokens ¶ + +To convert HTML trees to a format suitable for sequence prediction algorithm (like CRF, MEMM or Structured Perceptron) the following approach is used: + +Text is extracted from HTML and split into tokens. +For each token a special HtmlToken instance is created. It contains information not only about the text token itself, but also about its position in HTML tree. + +A single HTML page corresponds to a single input sequence (a list of HtmlTokens). For training/testing data (where webpages are already annotated) there is also a list of labels for each webpage, a label per HtmlToken. + +To transform HTML trees into labels and HTML tokens use HtmlTokenizer. + +html_tokenizer = webstruct. HtmlTokenizer () X, y = html_tokenizer. tokenize (trees) + +Input trees should be loaded by one of the WebStruct loaders. For consistency, for each tree (even if it is loaded from raw unannotated html) HtmlTokenizer extracts two arrays: a list of HtmlToken instances and a list of tags encoded using IOB2 encoding (also known as BIO encoding). So in our example X is a list of lists of HtmlToken instances, and y is a list of lists of strings. + +Feature Extraction ¶ + +For supervised machine learning algorithms to work we need to extract features. + +In WebStruct feature vectors are Python dicts {"feature_name":"feature_value"}; a dict is computed for each HTML token. How to convert these dicts into representation required by a sequence labelling toolkit depends on a toolkit used; we will cover that later. + +To compute feature dicts we’ll use HtmlFeatureExtractor. + +First, define your feature functions. A feature function should take an HtmlToken instance and return a feature dict; feature dicts from individual feature functions will be merged into the final feature dict for a token. Feature functions can ask questions about token itself, its neighbours (in the same HTML element), its position in HTML. 
+ +Note + +WebStruct supports other kind of feature functions that work on multiple tokens; we don’t cover them in this tutorial. + +There are predefined feature functions in webstruct.features, but for this tutorial let’s create some functions ourselves: + +def token_identity (html_token): return { 'token': html_token. token } def token_isupper (html_token): return { 'isupper': html_token. token. isupper ()} def parent_tag (html_token): return { 'parent_tag': html_token. parent. tag } def border_at_left (html_token): return { 'border_at_left': html_token. index == 0 } + +Next, create HtmlFeatureExtractor: + +feature_extractor = HtmlFeatureExtractor (token_features = [ token_identity, token_isupper, parent_tag, border_at_left ]) + +and use it to extract feature dicts: + +features = feature_extractor. fit_transform (X) + +See Feature Extraction for more info about HTML tokenization and feature extraction. + +Using a Sequence Labelling Toolkit ¶ + +WebStruct doesn’t provide a CRF or Structured Perceptron implementation; learning and prediction is supposed to be handled by an external sequence labelling toolkit like CRFSuite, Wapiti or seqlearn. + +Once feature dicts are extracted from HTML you should convert them to a format required by your sequence labelling tooklit and use this toolkit to train a model and do the prediction. For example, you may use DictVectorizer from scikit-learn to convert feature dicts into seqlearn input format. + +We’ll use CRFSuite in this tutorial. + +WebStruct provides some helpers for CRFSuite sequence labelling toolkit. To use CRFSuite with WebStruct, you need + +sklearn-crfsuite package (which depends on python-crfsuite and sklearn) + +Defining a Model ¶ + +Basic way to define CRF model is the following: + +model = webstruct. create_crfsuite_pipeline (token_features = [ token_identity, token_isupper, parent_tag, border_at_left ], verbose = True) + +First create_crfsuite_pipeline() argument is a list of feature functions which will be used for training. verbose is a boolean parameter enabling verbose output of various training information; check sklearn-crfsuite API reference for available options. + +Under the hood create_crfsuite_pipeline() creates a sklearn.pipeline.Pipeline with an HtmlFeatureExtractor instance followed by sklearn_crfsuite.CRF instance. The example above is just a shortcut for the following: + +model = Pipeline ([ ('fe', HtmlFeatureExtractor (token_features = [ token_identity, token_isupper, parent_tag, border_at_left, ])), ('crf', sklearn_crfsuite. CRF (verbose = True)), ]) + +Training ¶ + +To train a model use its fit method: + +model. fit (X, y) + +X and y are return values of HtmlTokenizer.tokenize() (a list of lists of HtmlToken instances and a list of lists of string IOB labels). + +If you use sklearn_crfsuite.CRF directly then train it using CRF.fit() method. It accepts 2 lists: a list of lists of feature dicts, and a list of lists of tags: + +model. fit (features, y) + +Named Entity Recognition ¶ + +Once you got a trained model you can use it to extract entities from unseen (unannotated) webpages. First, get some binary HTML data: + +>>> import urllib2 >>> html = urllib2. urlopen ("http://scrapinghub.com/contact"). read () + +Then create a NER instance initialized with a trained model: + +>>> ner = webstruct. NER (model) + +The model must provide a predict method that extracts features from HTML tokens and predicts labels for these tokens. A pipeline created with create_crfsuite_pipeline() function fits this definition. 
+ +Finally, use NER.extract() method to extract entities: + +>>> ner. extract (html) [('Scrapinghub', 'ORG'), ..., ('Iturriaga 3429 ap. 1', 'STREET'), ...] + +Generally, the steps are: + +Load data using HtmlLoader loader. If a custom HTML cleaner was used for loading training data make sure to apply it here as well. +Use the same html_tokenizer as used for training to extract HTML tokens from loaded trees. All labels would be “O” when using HtmlLoader loader - y can be discarded. +Use the same feature_extractor as used for training to extract features. +Run your_crf.predict() method (e.g. CRF.predict()) on features extracted in (3) to get the prediction - a list of IOB2-encoded tags for each input document. +Build entities from input tokens based on predicted tags (check IobEncoder.group() and smart_join()). +Split entities into groups (optional). One way to do it is to use webstruct.grouping. + +NER helper class combines HTML loading, HTML tokenization, feature extraction, CRF model, entity building and grouping. + +Entity Grouping ¶ + +Detecting entities on their own is not always enough; in many cases what is wanted is to find the relationship between them. For example, “ street_name/STREET city_name/CITY zipcode_number/ZIPCODE form an address”, or “ phone/TEL is a phone of person/PER ”. + +The first approximation is to say that all entities from a single webpage are related. For example, if we have extracted some organizaion/ORG and some phone/TEL from a single webpage we may assume that the phone is a contact phone of the organization. + +Sometimes there are several “entity groups” on a webpage. If a page contains contact phones of several persons or several business locations it is better to split all entities into groups of related entities - “person name + his/her phone(s)” or “address”. + +WebStruct provides an unsupervised algorithm for extracting such entity groups. Algorithm prefers to build large groups without entities of duplicate types; if a split is needed algorithm tries to split at points where distance between entities is larger. + +Use NER.extract_groups() to extract groups of entities: + +>>> ner. extract_groups (html) [[...], ... [('Iturriaga 3429 ap. 1', 'STREET'), ('Montevideo', 'CITY'), ...]] + +Sometimes it is better to allow some entity types to appear multuple times in a group. For example, a person (PER entity) may have several contact phones and faxes (TEL and FAX entities) - we should penalize groups with multiple PERs, but multiple TELs and FAXes are fine. Use dont_penalize argument if you want to allow some entity types to appear multiple times in a group: + +ner. extract_groups (html, dont_penalize = { 'TEL', 'FAX' }) + +The simple algorithm WebStruct provides is by no means a general solution to relation detection, but give it a try - maybe it is enough for your task. + +Model Development ¶ + +To develop the model you need to choose the learning algorithm, features, hyperparameters, etc. To do that you need scoring metrics, cross-validation utilities and tools for debugging what classifier learned. WebStruct helps in the following way: + +Pipeline created by create_crfsuite_pipeline() is compatible with cross-validation and grid search utilities from scikit-learn; use them to select model parameters and check the quality. + +One limitation of create_crfsuite_pipeline() is that n_jobs in scikit-learn functions and classes should be 1, but other than that WebStruct objects should work fine with scikit-learn. 
Just keep in mind that for WebStruct an “observation” is a document, not an individual token, and a “label” is a sequence of labels for a document, not an individual IOB tag. + +There is webstruct.metrics module with a couple of metrics useful for sequence classification. + +To debug what CRFSuite learned you could use eli5 library. With eli5 it would be two calls to eli5.explain_weights() and eli5.format_as_html() with sklearn_crfsuite.CRF instance as argument. As a result you will get transitions and feature weights. + +Next Previous + +© Copyright 2014-2017, Scrapinghub Inc.. Revision 9e461566. + +Built with Sphinx using a theme provided by Read the Docs. +Read the Docs v: latest + +Versions +latest +stable +0.6 +0.5 +0.4.1 +0.4 +0.3 +0.2 + +Downloads +pdf +htmlzip +epub + +On Read the Docs +Project Home +Builds + +Free document hosting provided by Read the Docs. \ No newline at end of file diff --git "a/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" new file mode 100644 index 0000000..1fbcb65 --- /dev/null +++ "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" @@ -0,0 +1,357 @@ + + + + + + + + + + + Webstruct — Webstruct 0.6 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    +
    +
    + +
    +

    Webstruct

    +

    Webstruct is a library for creating statistical NER systems that work +on HTML data, i.e. a library for building tools that extract named +entities (addresses, organization names, open hours, etc) from webpages.

    +

    Contents:

    + +
    +
    +

    Indices and tables

    + +
    + + +
    + +
    +
    + + + + +
    + +
    +

    + © Copyright 2014-2017, Scrapinghub Inc.. + + + Revision 9e461566. + + + +

    +
    + Built with Sphinx using a theme provided by Read the Docs. + +
    + +
    +
    + +
    + +
    + + +
    + + Read the Docs + v: latest + + +
    +
    +
    Versions
    + +
    latest
    + +
    stable
    + +
    0.6
    + +
    0.5
    + +
    0.4.1
    + +
    0.4
    + +
    0.3
    + +
    0.2
    + +
    +
    +
    Downloads
    + +
    pdf
    + +
    htmlzip
    + +
    epub
    + +
    +
    +
    On Read the Docs
    +
    + Project Home +
    +
    + Builds +
    +
    +
    + Free document hosting provided by Read the Docs. + +
    +
    + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" new file mode 100644 index 0000000..6297ee9 --- /dev/null +++ "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" @@ -0,0 +1,91 @@ +Webstruct — Webstruct 0.6 documentation + +Webstruct +latest + +Webstruct +Tutorial +Reference +Changes + +Webstruct + +Docs » +Webstruct +Edit on GitHub + +Webstruct ¶ + +Webstruct is a library for creating statistical NER systems that work on HTML data, i.e. a library for building tools that extract named entities (addresses, organization names, open hours, etc) from webpages. + +Contents: + +Webstruct + +Overview +Installation + +Tutorial + +Get annotated data +From HTML to Tokens +Feature Extraction +Using a Sequence Labelling Toolkit +Named Entity Recognition +Entity Grouping +Model Development + +Reference + +HTML Loaders +Feature Extraction +Model Creation Helpers +Metrics +Entity Grouping +Wapiti Helpers +CRFsuite Helpers +WebAnnotator Utilities +BaseSequenceClassifier +Miscellaneous + +Changes + +0.6 (2017-12-29) +0.5 (2017-05-10) +0.4.1 (2016-11-28) +0.4 (2016-11-26) +0.3 (2016-09-19) + +Indices and tables ¶ + +Index +Module Index +Search Page + +Next + +© Copyright 2014-2017, Scrapinghub Inc.. Revision 9e461566. + +Built with Sphinx using a theme provided by Read the Docs. +Read the Docs v: latest + +Versions +latest +stable +0.6 +0.5 +0.4.1 +0.4 +0.3 +0.2 + +Downloads +pdf +htmlzip +epub + +On Read the Docs +Project Home +Builds + +Free document hosting provided by Read the Docs. \ No newline at end of file From 05c77023804bdad5c8c6476af86ede466ef49bb8 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:15:10 -0700 Subject: [PATCH 28/40] tests to hopefully make codecov happy --- tests/test_html_text.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 612668d..8b0c73d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -31,7 +31,8 @@ def test_extract_no_text_html(all_options): def test_extract_text(all_options): - html = u'

    Hello, world!' + html = (u'' + '

    Hello, world!') assert extract_text(html, **all_options) == u'Hello, world!' @@ -47,7 +48,8 @@ def test_empty(all_options): def test_extract_text_from_tree(all_options): - html = u'

    Hello, world!' + html = (u'' + '

    Hello, world!') tree = parse_html(html) assert extract_text(tree, **all_options) == u'Hello, world!' @@ -85,9 +87,8 @@ def test_bad_punct_whitespace(): def test_selector(all_options): - html = ( - u'textmoreand more text' - ) + html = (u'textmore' + 'and more text') sel = cleaned_selector(html) assert selector_to_text(sel, **all_options) == 'text more and more text' subsel = sel.xpath('//span[@id="extract-me"]')[0] @@ -117,6 +118,12 @@ def test_guess_page_layout(): '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) +def test_adjust_newline(): + html = u'

    text 1

    text 2

    ' + assert (extract_text(html, guess_punct_space=True, + guess_page_layout=True) == ('text 1\n\ntext 2')) + + def test_webpages(): webpages = sorted(glob.glob('./test_webpages/*.html')) extracted = sorted(glob.glob('./test_webpages/*.txt')) From 4505e24c577d6e071761d82f508b1fca10ec7085 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:51:27 -0700 Subject: [PATCH 29/40] remove pathlib import --- tests/test_html_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 8b0c73d..db78a1e 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,7 +2,6 @@ import pytest import lxml import glob -from pathlib import Path from html_text import (extract_text, parse_html, cleaned_selector, selector_to_text) From a27e4c8ba750c4b4df1e0c7f32fbcd2c7c3e8a29 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:52:31 -0700 Subject: [PATCH 30/40] fix test --- tests/test_html_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index db78a1e..86c32fe 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -118,7 +118,7 @@ def test_guess_page_layout(): def test_adjust_newline(): - html = u'
    text 1

    text 2

    ' + html = u'
    text 1

    text 2

    ' assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == ('text 1\n\ntext 2')) From b926c8cfd43f6813d29a17b80a554cac86d73da8 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 18:00:21 -0700 Subject: [PATCH 31/40] remove space --- html_text/html_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9c9a20a..d391a77 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -115,7 +115,7 @@ def traverse_text_fragments(tree, prev, depth): if guess_page_layout: newline, prev[0] = add_newline(tree.tag, prev[0]) - + tail = '' if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) From 73f49ad6fa6fc27cefad2b146859e84b8b6fa601 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Wed, 19 Sep 2018 16:45:47 -0700 Subject: [PATCH 32/40] handle list of selectors --- html_text/html_text.py | 25 ++++++++++++++++++------- tests/test_html_text.py | 10 ++++++---- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index d391a77..314ccb5 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -139,10 +139,20 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): See html_text.extract_text docstring for description of the approach and options. """ - return _html_to_text( - sel.root, - guess_punct_space=guess_punct_space, - guess_page_layout=guess_page_layout) + if isinstance(sel, list): + # if selecting a specific xpath + text = [ + _html_to_text( + t.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) for t in sel + ] + return ' '.join(text) + else: + return _html_to_text( + sel.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) def cleaned_selector(html): @@ -173,9 +183,10 @@ def extract_text(html, for punctuation. This has a slight (around 10%) performance overhead and is just a heuristic. - When guess_page_layout is True (default is False), a newline is added after - NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic - makes the extracted text more similar to how it looks like in the browser. + When guess_page_layout is True (default is False), a newline is added + before and after NEWLINE_TAGS and two newlines are added before and after + DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar + to how it is rendered in the browser. NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized. 
From 73f49ad6fa6fc27cefad2b146859e84b8b6fa601 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 16:45:47 -0700
Subject: [PATCH 32/40] handle list of selectors

---
 html_text/html_text.py  | 25 ++++++++++++++++++-------
 tests/test_html_text.py | 10 ++++++----
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index d391a77..314ccb5 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -139,10 +139,20 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
     See html_text.extract_text docstring for description
     of the approach and options.
     """
-    return _html_to_text(
-        sel.root,
-        guess_punct_space=guess_punct_space,
-        guess_page_layout=guess_page_layout)
+    if isinstance(sel, list):
+        # if selecting a specific xpath
+        text = [
+            _html_to_text(
+                t.root,
+                guess_punct_space=guess_punct_space,
+                guess_page_layout=guess_page_layout) for t in sel
+        ]
+        return ' '.join(text)
+    else:
+        return _html_to_text(
+            sel.root,
+            guess_punct_space=guess_punct_space,
+            guess_page_layout=guess_page_layout)
 
 
 def cleaned_selector(html):
@@ -173,9 +183,10 @@ def extract_text(html,
     for punctuation. This has a slight (around 10%) performance overhead
     and is just a heuristic.
 
-    When guess_page_layout is True (default is False), a newline is added after
-    NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic
-    makes the extracted text more similar to how it looks like in the browser.
+    When guess_page_layout is True (default is False), a newline is added
+    before and after NEWLINE_TAGS and two newlines are added before and after
+    DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar
+    to how it is rendered in the browser.
 
     NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized.
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index 86c32fe..c461853 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -86,12 +86,14 @@ def test_bad_punct_whitespace():
 
 def test_selector(all_options):
-    html = (u'<span id="extract-me">text<a>more</a></span>'
-            'and more text')
+    html = (u'<span id="extract-me">text<a>more</a></span>'
+            'and more text <a> and some more </a>')
     sel = cleaned_selector(html)
-    assert selector_to_text(sel, **all_options) == 'text more and more text'
-    subsel = sel.xpath('//span[@id="extract-me"]')[0]
+    assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
+    subsel = sel.xpath('//span[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == 'text more'
+    subsel = sel.xpath('//a')
+    assert selector_to_text(subsel, **all_options) == 'more and some more'

From 15d22e050a4d1f5ec06d054a2fb6535c4b087d03 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 16:48:20 -0700
Subject: [PATCH 33/40] a list of selectors returns a list of texts

---
 html_text/html_text.py  | 2 +-
 tests/test_html_text.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 314ccb5..a298d4a 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -147,7 +147,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
                 guess_punct_space=guess_punct_space,
                 guess_page_layout=guess_page_layout) for t in sel
         ]
-        return ' '.join(text)
+        return text
     else:
         return _html_to_text(
             sel.root,
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index c461853..fbb892d 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -91,9 +91,9 @@ def test_selector(all_options):
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == 'text more'
+    assert selector_to_text(subsel, **all_options) == ['text more']
     subsel = sel.xpath('//a')
-    assert selector_to_text(subsel, **all_options) == 'more and some more'
+    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']

From 8f68b2c99768fd1f9ca7b02f7a72be2901d20741 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:02:38 -0700
Subject: [PATCH 34/40] selectors_to_text add to res only if something is extracted

---
 README.rst              | 29 ++++++++++++++++++++++-------
 html_text/html_text.py  | 10 ++++++----
 tests/test_html_text.py |  4 +++-
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/README.rst b/README.rst
index cac5424..b5020b5 100644
--- a/README.rst
+++ b/README.rst
@@ -26,9 +26,10 @@ or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
 It normalizes whitespace, but is also smarter than
 ``.xpath('normalize-space())``,
-adding spaces around inline elements too
-(which are often used as block elements in html markup),
-and tries to avoid adding extra spaces for punctuation.
+adding spaces around inline elements (which are often used as block
+elements in html markup), tries to avoid adding extra spaces for punctuation and
+can add newlines so that the output text looks like how it is rendered in
+browsers.
 Apart from just getting text from the page (e.g. for display or search),
 one intended usage of this library is for machine learning (feature extraction).
@@ -56,18 +57,32 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hey</h1>')
-    u'Hey'
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    u'Hello world!'
+
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    u'Hello
+    world!'
 
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
-    >>> tree = html_text.parse_html(u'<h1>Hey</h1>')
+    >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
     >>> text = html_text.extract_text(tree)
-    u'Hey'
+    u'Hello world!'
+
+Or define a selector to extract text only from specific elements, this will
+return a list of strings of text, one for each element:
+
+    >>> import html_text
+    >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
+    >>> subsel = sel.xpath('//h1')
+    >>> text = html_text.selector_to_text(subsel)
+    [u'Hello']
 
 Passed html will be first cleaned from invisible non-text content such
 as styles, and then text would be extracted.
+
 Two functions that do it are ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
diff --git a/html_text/html_text.py b/html_text/html_text.py
index a298d4a..5a3037d 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -141,12 +141,14 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
     """
     if isinstance(sel, list):
         # if selecting a specific xpath
-        text = [
-            _html_to_text(
+        text = []
+        for t in sel:
+            extracted = _html_to_text(
                 t.root,
                 guess_punct_space=guess_punct_space,
-                guess_page_layout=guess_page_layout) for t in sel
-        ]
+                guess_page_layout=guess_page_layout)
+            if extracted:
+                text.append(extracted)
         return text
     else:
         return _html_to_text(
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index fbb892d..b01329c 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -87,13 +87,15 @@ def test_bad_punct_whitespace():
 
 def test_selector(all_options):
     html = (u'<span id="extract-me">text<a>more</a></span>'
-            'and more text <a> and some more </a>')
+            'and more text <a> and some more </a> <a id="extract-me"></a>')
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == ['text more']
     subsel = sel.xpath('//a')
    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']
+    subsel = sel.xpath('//a[@id="extract-me"]')
+    assert selector_to_text(subsel, **all_options) == []

From cf02b940b8d5b32daa353fd73bfd485d14afd3e6 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:03:53 -0700
Subject: [PATCH 35/40] selectors_to_text merge results as in previous implementation

---
 html_text/html_text.py  | 2 +-
 tests/test_html_text.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 5a3037d..cb64058 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -149,7 +149,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
                 guess_page_layout=guess_page_layout)
             if extracted:
                 text.append(extracted)
-        return text
+        return ' '.join(text)
     else:
         return _html_to_text(
             sel.root,
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index b01329c..eb8cacf 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -91,11 +91,11 @@ def test_selector(all_options):
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == ['text more']
+    assert selector_to_text(subsel, **all_options) == 'text more'
     subsel = sel.xpath('//a')
-    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']
+    assert selector_to_text(subsel, **all_options) == 'more and some more'
     subsel = sel.xpath('//a[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == []
+    assert selector_to_text(subsel, **all_options) == ''
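After patches 32-35, selector_to_text settles on the following contract: a
single parsel.Selector yields one string, while a SelectorList joins the
per-element extractions with a space, skipping elements that extract nothing.
A hedged sketch (the markup here is chosen for illustration only):

    from html_text import cleaned_selector, selector_to_text

    sel = cleaned_selector(u'<p>one</p><p>two</p><p></p>')

    selector_to_text(sel)                   # -> u'one two'
    selector_to_text(sel.xpath('//p'))      # -> u'one two' (empty <p> skipped)
    selector_to_text(sel.xpath('//table'))  # -> u'' (no matches at all)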
From 7aec8d2ae0c044996f8549d307a67c97ce71479e Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:11:50 -0700
Subject: [PATCH 36/40] update readme

---
 README.rst | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index b5020b5..77457cc 100644
--- a/README.rst
+++ b/README.rst
@@ -71,17 +71,18 @@ You can also pass already parsed ``lxml.html.HtmlElement``:
     >>> text = html_text.extract_text(tree)
     u'Hello world!'
 
-Or define a selector to extract text only from specific elements, this will
-return a list of strings of text, one for each element:
+Or define a selector to extract text only from specific elements:
 
     >>> import html_text
     >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
     >>> subsel = sel.xpath('//h1')
     >>> text = html_text.selector_to_text(subsel)
-    [u'Hello']
+    u'Hello'
 
 Passed html will be first cleaned from invisible non-text content such
 as styles, and then text would be extracted.
+NB: Selectors are not cleaned automatically; you need to call
+``html_text.cleaned_selector`` first.
 
 Two functions that do it are ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
@@ -90,6 +91,24 @@ Two functions that do it are ``html_text.cleaned_selector`` and
    and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
   extracted text.
+* ``html_text.extract_text`` accepts html and returns extracted text.
+
+If ``guess_page_layout`` is True (False by default for backward compatibility),
+a newline is added before and after NEWLINE_TAGS and two newlines are added
+before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
+more similar to how it is rendered in the browser.
+NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized; these are the lists of
+the tags that are handled by default:
+
+* NEWLINE_TAGS = frozenset([
+      'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
+      'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
+      'nav', 'table', 'tr'
+  ])
+* DOUBLE_NEWLINE_TAGS = frozenset([
+      'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
+      'p', 'pre', 'title', 'ul'
+  ])
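To make the default tag sets above concrete, a small hedged example; the
output is what the documented rules imply rather than something copied from
the patch:

    from html_text import extract_text

    # <br> is in NEWLINE_TAGS, <p> is in DOUBLE_NEWLINE_TAGS:
    extract_text(u'<p>one<br>two</p><p>three</p>', guess_page_layout=True)
    # -> u'one\ntwo\n\nthree'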
From 4300fe6f14659fdc8b03c335b4ff3ab9a3c02546 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:20:44 -0700
Subject: [PATCH 37/40] update history

---
 CHANGES.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index c6aa51f..8a31755 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,6 +2,14 @@
 History
 =======
 
+0.4.0 (TBD)
+------------------
+
+* Add ``guess_page_layout`` to make extracted text look like how it is
+  rendered in a browser.
+* Add tests of layout extraction for real webpages.
+
+
 0.3.0 (2017-10-12)
 ------------------

From 4300fe6f14659fdc8b03c335b4ff3ab9a3c02546 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:24:32 -0700
Subject: [PATCH 38/40] update readme

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 77457cc..d947c80 100644
--- a/README.rst
+++ b/README.rst
@@ -84,14 +84,14 @@ as styles, and then text would be extracted.
 NB: Selectors are not cleaned automatically; you need to call
 ``html_text.cleaned_selector`` first.
 
-Two functions that do it are ``html_text.cleaned_selector`` and
+The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
 
+* ``html_text.extract_text`` accepts html and returns extracted text.
 * ``html_text.cleaned_selector`` accepts html as text or as
   ``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
   extracted text.
-* ``html_text.extract_text`` accepts html and returns extracted text.
 
 If ``guess_page_layout`` is True (False by default for backward compatibility),

From 477206178506e77c13fe96d26167855fef5cfe30 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Thu, 20 Sep 2018 12:11:40 -0700
Subject: [PATCH 39/40] update readme and add newline personalization tests

---
 README.rst              | 40 +++++++++++++++++++---------------------
 html_text/__init__.py   |  2 +-
 tests/test_html_text.py | 20 +++++++++++++++++++-
 3 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/README.rst b/README.rst
index d947c80..9ee8b98 100644
--- a/README.rst
+++ b/README.rst
@@ -57,18 +57,27 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!')
     u'Hello world!'
 
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
-    u'Hello
-    world!'
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    'Hello\n\nworld!'
+
+
+It is possible to add specific tags to ``html_text.NEWLINE_TAGS`` and
+``html_text.DOUBLE_NEWLINE_TAGS``:
+
+    >>> html_text.extract_text(
+    ...     u'<a>Hello</a> <a>world!</a>',
+    ...     guess_page_layout=True,
+    ...     newline_tags=html_text.NEWLINE_TAGS | {'a'})
+    'Hello\n\nworld!'
+
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
     >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
-    >>> text = html_text.extract_text(tree)
+    >>> html_text.extract_text(tree)
     u'Hello world!'
 
 Or define a selector to extract text only from specific elements:
@@ -76,7 +85,7 @@ Or define a selector to extract text only from specific elements:
     >>> import html_text
     >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
     >>> subsel = sel.xpath('//h1')
-    >>> text = html_text.selector_to_text(subsel)
+    >>> html_text.selector_to_text(subsel)
     u'Hello'
 
 Passed html will be first cleaned from invisible non-text content such
@@ -94,21 +103,10 @@ The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector``
    extracted text.
 
 If ``guess_page_layout`` is True (False by default for backward compatibility),
-a newline is added before and after NEWLINE_TAGS and two newlines are added
-before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
-more similar to how it is rendered in the browser.
-NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized; these are the lists of
-the tags that are handled by default:
-
-* NEWLINE_TAGS = frozenset([
-      'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
-      'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
-      'nav', 'table', 'tr'
-  ])
-* DOUBLE_NEWLINE_TAGS = frozenset([
-      'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
-      'p', 'pre', 'title', 'ul'
-  ])
+a newline is added before and after ``newline_tags`` and two newlines are added
+before and after ``double_newline_tags``. This heuristic makes the extracted text
+more similar to how it is rendered in the browser. Default newline and double
+newline tags can be found in ``html_text.NEWLINE_TAGS`` and ``html_text.DOUBLE_NEWLINE_TAGS``.
diff --git a/html_text/__init__.py b/html_text/__init__.py
index 61ef192..843b010 100644
--- a/html_text/__init__.py
+++ b/html_text/__init__.py
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 
 from .html_text import (extract_text, parse_html, cleaned_selector,
-                        selector_to_text)
+                        selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eb8cacf..fe53f98 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -4,7 +4,7 @@ import glob
 
 from html_text import (extract_text, parse_html, cleaned_selector,
-                       selector_to_text)
+                       selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
 
 
 @pytest.fixture(params=[{
@@ -127,6 +127,24 @@ def test_adjust_newline():
                          guess_page_layout=True) == ('text 1\n\ntext 2'))
 
 
+def test_personalize_newlines_sets():
+    html = (u'<span>text<a>more</a></span>'
+            'and more text <a> and some more </a>')
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        newline_tags=NEWLINE_TAGS | {'a'}
+    ) == 'text\nmore\nand more text\nand some more')
+
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'}
+    ) == 'text\n\nmore\n\nand more text\n\nand some more')
+
+
 def test_webpages():
     webpages = sorted(glob.glob('./test_webpages/*.html'))
     extracted = sorted(glob.glob('./test_webpages/*.txt'))

From 05b979a6f43fa27d10eb29df1cdfae14b566f235 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Thu, 20 Sep 2018 14:41:30 -0700
Subject: [PATCH 40/40] change documentation

---
 html_text/html_text.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index cb64058..ecb7431 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -186,11 +186,12 @@ def extract_text(html,
     and is just a heuristic.
 
     When guess_page_layout is True (default is False), a newline is added
-    before and after NEWLINE_TAGS and two newlines are added before and after
-    DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar
+    before and after newline_tags and two newlines are added before and after
+    double_newline_tags. This heuristic makes the extracted text more similar
     to how it is rendered in the browser.
 
-    NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized.
+    NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be extended; check the README
+    for an example of how to do it.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
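A closing usage sketch of the customization API as it stands at the end of
the series; the markup mirrors test_personalize_newlines_sets and the
outputs are the ones the new tests assert:

    import html_text

    html = u'<span>text<a>more</a></span>and more text <a> and some more </a>'

    html_text.extract_text(
        html,
        guess_page_layout=True,
        newline_tags=html_text.NEWLINE_TAGS | {'a'})
    # -> 'text\nmore\nand more text\nand some more'

    html_text.extract_text(
        html,
        guess_page_layout=True,
        double_newline_tags=html_text.DOUBLE_NEWLINE_TAGS | {'a'})
    # -> 'text\n\nmore\n\nand more text\n\nand some more'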