
Add guess page layout #9

Merged (40 commits, Sep 25, 2018)
Changes from 14 commits

Commits (40):
0ae6d24
add first working approach plus debug code
Kebniss Aug 24, 2018
566dc9b
add newline only at the end of selected tags
Kebniss Aug 24, 2018
587e9a7
fix multiple consecutive newlines
Kebniss Aug 27, 2018
6c9d27e
add guess_space = False option
Kebniss Aug 27, 2018
c22f3fa
move add space and newline checks to a function
Kebniss Aug 28, 2018
8a78fc5
add tests guess_page_layout
Kebniss Aug 28, 2018
a783e31
remove old test
Kebniss Aug 29, 2018
cb8dc1c
guess_punct_space = False behavior same as before this PR
Kebniss Aug 30, 2018
fb599bc
fix tests
Kebniss Aug 30, 2018
90e37b7
fixed tests
Kebniss Aug 30, 2018
ae26d29
fix indent and make add_space more readable
Kebniss Aug 30, 2018
bb33d4b
add double newline before and after title, p and h tags
Kebniss Aug 31, 2018
3069a73
by default tail of root node will not be extracted
Kebniss Sep 6, 2018
dd03201
add test
Kebniss Sep 6, 2018
0f2fb2b
fix indentation
Kebniss Sep 7, 2018
e8da507
newline tags as set and extendable, add new features comments, delete…
Kebniss Sep 7, 2018
0b9d139
make html_to_text private, fix its signature
Kebniss Sep 8, 2018
ba7cdc0
add new tags to handle
Kebniss Sep 8, 2018
952d895
handle more tags
Kebniss Sep 10, 2018
9dafbf0
remove cleaning of inline tags
Kebniss Sep 11, 2018
b3229d6
fix bug with multiple newlines
Kebniss Sep 11, 2018
695b458
remove newline
Kebniss Sep 11, 2018
03259b9
add test html without text
Kebniss Sep 11, 2018
cba531f
fix newline + space bug
Kebniss Sep 11, 2018
9811349
add bad punct test
Kebniss Sep 11, 2018
d47138c
add newline
Kebniss Sep 11, 2018
76f9028
add tests on real webpages
Kebniss Sep 11, 2018
05c7702
tests to hopefully make codecov happy
Kebniss Sep 11, 2018
4505e24
remove pathlib import
Kebniss Sep 11, 2018
a27e4c8
fix test
Kebniss Sep 11, 2018
b926c8c
remove space
Kebniss Sep 12, 2018
73f49ad
handle list of selectors
Kebniss Sep 19, 2018
15d22e0
a list of selectors returns a list of texts
Kebniss Sep 19, 2018
8f68b2c
selectors_to_text add to res only if something is extracted
Kebniss Sep 20, 2018
cf02b94
selectors_to_text merge results as in previous implementation
Kebniss Sep 20, 2018
7aec8d2
update readme
Kebniss Sep 20, 2018
7653bf9
update history
Kebniss Sep 20, 2018
4300fe6
update readme
Kebniss Sep 20, 2018
4772061
update readme and add newline personalization tests
Kebniss Sep 20, 2018
05b979a
change documentation
Kebniss Sep 20, 2018
html_text/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-

-from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from .html_text import extract_text, parse_html, html_to_text, cleaned_selector, selector_to_text
html_text/html_text.py (108 changes: 86 additions & 22 deletions)
@@ -7,6 +7,9 @@
import parsel


NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol']

Contributor:

Shouldn't we add table tags like tr and th as well? Check e.g. http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html - currently the product information is all on the same line.

It'd also be nice to add a few realistic tests: a few examples of HTML pages and their text output (in separate files, for readability). Text should be extracted with guess_page_layout=True. I think this would allow us to detect regressions / changes in the output better, and also find cases which are not handled properly.
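
For example, a minimal sketch of such a file-based test, assuming hypothetical fixture files tests/test_webpage.html and tests/test_webpage.txt (the file names and the _load helper are illustrations, not part of this PR):

import codecs

from html_text import extract_text

def _load(path):
    with codecs.open(path, encoding='utf8') as f:
        return f.read()

def test_webpage_layout():
    # compare a real page against a hand-checked text rendering
    html = _load('tests/test_webpage.html')
    expected = _load('tests/test_webpage.txt')
    assert extract_text(html, guess_page_layout=True) == expected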

kmike (Contributor), Sep 6, 2018:

To clarify: which HTML tags are not in these two lists because they shouldn't be there, and which are not there because we're not handling them yet? Which of the tags from https://developer.mozilla.org/en-US/docs/Web/HTML/Element have you checked? Maybe it makes sense to handle more of them?

DOUBLE_NEWLINE_TAGS = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

kmike (Contributor), Sep 6, 2018:

These lists are used for lookups; even though they're short, I think it is cleaner and faster to have them as sets.

In [1]: x = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
In [2]: x_set = {'title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}
In [3]: %timeit 'foo' in x
162 ns ± 0.398 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
In [4]: %timeit 'foo' in x_set
39.8 ns ± 0.153 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)

^^ lookup in a set is ~4x faster even when the list is that short; not a lot, but why not do that :)

kmike (Contributor), Sep 6, 2018:

It'd also be nice to allow overriding these double_newline_tags and newline_tags in extract_text; these constants would then just be defaults (use frozenset instead of set if you do so). A use case is the following: you want to extract text from a particular website, and know that e.g. div element should add a new line. You then write

extract_text(html, guess_page_layout=True, newline_tags=NEWLINE_TAGS | {'div'})
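
Putting the two suggestions together, a rough sketch of what that could look like (the exact parameter names and defaults here are an assumption, not the final API):

NEWLINE_TAGS = frozenset(['li', 'dd', 'dt', 'dl', 'ul', 'ol'])
DOUBLE_NEWLINE_TAGS = frozenset(['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])

def extract_text(html, guess_punct_space=True, guess_page_layout=False,
                 newline_tags=NEWLINE_TAGS,
                 double_newline_tags=DOUBLE_NEWLINE_TAGS):
    # frozenset defaults are safe to share between calls; a caller can still
    # pass a custom set, e.g. newline_tags=NEWLINE_TAGS | {'div'}
    ...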


_clean_html = Cleaner(
    scripts=True,
    javascript=False, # onclick attributes are fine
@@ -44,30 +47,89 @@ def parse_html(html):
_whitespace = re.compile(r'\s+')
_has_trailing_whitespace = re.compile(r'\s$').search
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
-_has_punct_before = re.compile(r'\($').search
+_has_open_bracket_before = re.compile(r'\($').search


def html_to_text(tree, guess_punct_space=True, guess_page_layout=False):

Contributor:

I think we should keep this function private - rename it to _html_to_text and remove it from the __init__ exports. Otherwise the API becomes rather confusing: there are html_text.extract_text and html_text.html_to_text, which do almost the same thing - extract_text supports html as a string in addition to lxml trees, and also does cleaning on its own; html_to_text only works on lxml trees and does no cleaning.

If allowing an already cleaned tree to be passed is important, we can add an argument to extract_text - though this can be done later.

Contributor Author:

I think the only reason for not wanting the HTML to be cleaned is performance. Let's just make html_to_text private (_html_to_text), and later I can check performance with and without cleaning to see if there is a difference.

Contributor:

@Kebniss right, though let's not worry about adding a clean=True argument now. The main use cases are:

  • the user has an already cleaned tree and wants to save some time by avoiding cleaning it again;
  • the user wants to change cleaning options.
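
For later reference, a rough sketch of what such an argument could look like (the clean name, the str check and the use of the proposed _html_to_text name are assumptions, not agreed API):

def extract_text(html, guess_punct_space=True, guess_page_layout=False,
                 clean=True):
    tree = parse_html(html) if isinstance(html, str) else html
    if clean:
        # skipped when the caller passes an already cleaned tree
        tree = _clean_html.clean_html(tree)
    return _html_to_text(tree, guess_punct_space=guess_punct_space,
                         guess_page_layout=guess_page_layout)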

"""
Convert a cleaned html tree to text.
See html_text.extract_text docstring for description of the approach
and options.
"""

def selector_to_text(sel, guess_punct_space=True):

Contributor:

This function was useful - the main use case is to extract text from a part of a web page, finding this part using Scrapy or parsel.

Contributor:

You can get a parsed tree for a selector using sel.root

Contributor Author:

Ok so you want to create a selector in extract_text(html) and then apply traverse_text_fragments on sel.root, correct?

Contributor:

if selector_to_text is supported, cleaned_selector is also nice to have

Contributor:

@Kebniss no, extract_text doesn't need to use Selector, it is an additional overhead. The idea is to be backwards compatible and provide the same feature for Selector; internally it can work the other way around - likely selector_to_text should pass sel.root to html_to_text.
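
A minimal usage sketch of that backwards-compatible path (the sample HTML and the XPath are just examples):

import html_text

html = u'<body><div id="content"><h1>Title</h1><p>First. <a href="#">More</a></p></div>footer</body>'
sel = html_text.cleaned_selector(html)        # parsel.Selector over a cleaned tree
part = sel.xpath('//div[@id="content"]')[0]   # extract text only from this part of the page
print(html_text.selector_to_text(part, guess_page_layout=True))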

Contributor:

Can't we have a tail_text argument in html_to_text? When it is False, tail extraction is skipped, but only at the top level (i.e. text is still extracted from children tails). It can be False by default - I don't see why anyone would want to extract text from the tail of the root element.

Contributor:

Alternatively, we can just not extract text from the element tail by default, at the top level (i.e. children should have their tail text processed as usual).

In the common case (root <html> element) there shouldn't be any text in the tail. And when a user passes another element explicitly, extracting text from the element tail is likely undesirable - it is the same issue as with Selectors.

Contributor Author:

You are right: tail text is outside the selected nodes and as such it should not be extracted. Not extracting it by default seems reasonable. I will add the root object as an argument so we can check when the recursion is processing it.

kmike (Contributor), Sep 6, 2018:

Why do you need a root object? Can't it just be a boolean process_tail flag in some internal function?

Contributor Author:

Yes, the root node is unnecessary. I added a depth argument so that we know when the recursion is back at the root and do not extract the tail there.
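
A tiny illustration of the tail issue (the snippet is only for explanation, not part of the PR):

import lxml.html

tree = lxml.html.fromstring(
    '<html><body><div>inside</div>outside</body></html>')
div = tree.xpath('//div')[0]
print(div.text)   # 'inside'  - text that belongs to the selected <div>
print(div.tail)   # 'outside' - text that sits outside <div>; with depth == 0 it is now skipped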

    def add_space(text, prev):
        if prev is None:
            return False
        if prev == '\n' or prev == '\n\n':
            return False
        if not _has_trailing_whitespace(prev):
            if _has_punct_after(text) or _has_open_bracket_before(prev):
                return False
        return True

    def add_newline(tag, prev):
        if prev is None or prev == '\n\n':
            return ''
        if tag in DOUBLE_NEWLINE_TAGS:
            if prev == '\n':
                return '\n'
            return '\n\n'
        if tag in NEWLINE_TAGS:
            if prev == '\n':
                return ''
            return '\n'
        return ''

    def traverse_text_fragments(tree, prev, depth):
        space = ' '
        newline = ''
        if tree.text:
            text = _whitespace.sub(' ', tree.text.strip())
            if text:
                if guess_page_layout:
                    newline = add_newline(tree.tag, prev[0])
                    if newline:
                        prev[0] = newline
                if guess_punct_space and not add_space(text, prev[0]):
                    space = ''
                yield [newline, space, text]
                prev[0] = tree.text
                space = ' '
                newline = ''

        for child in tree:
            for t in traverse_text_fragments(child, prev, depth+1):
                yield t

        if guess_page_layout:
            newline = add_newline(tree.tag, prev[0])
            if newline:
                prev[0] = newline

        tail = ''
        if tree.tail and depth != 0:
            tail = _whitespace.sub(' ', tree.tail.strip())
            if tail:
                if guess_punct_space and not add_space(tail, prev[0]):
                    space = ''
        if tail:
            yield [newline, space, tail]
            prev[0] = tree.tail
        elif newline:
            yield [newline]

    text = []
    for fragment in traverse_text_fragments(tree, [None], 0):
        text.extend(fragment)
    return ''.join(text).strip()


def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
""" Convert a cleaned selector to text.
See html_text.extract_text docstring for description of the approach and options.
"""
-    if guess_punct_space:
-
-        def fragments():
-            prev = None
-            for text in sel.xpath('.//text()').extract():
-                if prev is not None and (_has_trailing_whitespace(prev)
-                                         or (not _has_punct_after(text) and
-                                             not _has_punct_before(prev))):
-                    yield ' '
-                yield text
-                prev = text
-
-        return _whitespace.sub(' ', ''.join(fragments()).strip())
-
-    else:
-        fragments = (x.strip() for x in sel.xpath('.//text()').extract())
-        return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
+    return html_to_text(sel.root, guess_punct_space=guess_punct_space,
+                        guess_page_layout=guess_page_layout)


def cleaned_selector(html):
@@ -85,7 +147,7 @@ def cleaned_selector(html):
    return sel


-def extract_text(html, guess_punct_space=True):
+def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True):

Contributor:

new argument is unused and undocumented

"""
Convert html to text, cleaning invisible content such as styles.
Almost the same as normalize-space xpath, but this also
Expand All @@ -98,5 +160,7 @@ def extract_text(html, guess_punct_space=True):

    html should be a unicode string or an already parsed lxml.html element.

Contributor:

guess_page_layout argument should be documented.
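
A possible wording for the docstring (just a suggestion):

    guess_page_layout: when True, add a newline before and after elements
    from NEWLINE_TAGS, and an empty line (two newlines) before and after
    elements from DOUBLE_NEWLINE_TAGS, to better preserve the visual layout
    of the page.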

"""
sel = cleaned_selector(html)
return selector_to_text(sel, guess_punct_space=guess_punct_space)
if html is None or len(html) == 0:
return ''
cleaned = _cleaned_html_tree(html)
return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout,)
tests/test_html_text.py (44 changes: 41 additions & 3 deletions)
@@ -1,11 +1,17 @@
# -*- coding: utf-8 -*-
import pytest
import lxml

-from html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from html_text import (extract_text, html_to_text, parse_html, parse_html,
+                       cleaned_selector, selector_to_text)


@pytest.fixture(params=[{'guess_punct_space': True},
-                        {'guess_punct_space': False}])
+                        {'guess_punct_space': False},
+                        {'guess_punct_space': True, 'guess_page_layout': True},
+                        {'guess_punct_space': False, 'guess_page_layout': True}
+                        ])

def all_options(request):
    return request.param

@@ -48,10 +54,42 @@ def test_punct_whitespace_preserved():
    assert (extract_text(html, guess_punct_space=True) ==
            u'по ле, and , more ! now a (boo)')


def test_selector(all_options):
    html = '<div><div id="extract-me">text<div>more</div></div>and more text</div>'
    sel = cleaned_selector(html)
    assert selector_to_text(sel, **all_options) == 'text more and more text'
    subsel = sel.xpath('//div[@id="extract-me"]')[0]
    assert selector_to_text(subsel, **all_options) == 'text more'


def test_html_to_text():
    html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p><ul>'
            '<li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
            'text_8</p>text_9</div><p>...text_10</p>')

    parser = lxml.html.HTMLParser(encoding='utf8')
    tree = lxml.html.fromstring(html.encode('utf8'), parser=parser)

    assert (html_to_text(tree, guess_punct_space=False) ==
            ('title text_1. text_2 text_3 text_4 text_5'
             ' text_6 text_7 text_8 text_9 ...text_10'))
    assert (html_to_text(tree, guess_punct_space=False, guess_page_layout=True) ==
            ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5'
             '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10'))
    assert (html_to_text(tree, guess_punct_space=True) ==
            ('title text_1. text_2 text_3 text_4 text_5'
             ' text_6 text_7 text_8 text_9...text_10'))
    assert (html_to_text(tree, guess_punct_space=True, guess_page_layout=True) ==
            ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
             '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10'))

def test_guess_page_layout():
    html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
            '<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
            '<p>text_6<em>text_7</em>text_8</p>text_9</div>'
            '<script>document.getElementById("demo").innerHTML = '
            '"This should be skipped";</script> <p>...text_10</p>'
            )
    assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) ==
            ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5'
             '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10'))