TeamHG-Memex · kmike · Sep 25, 2018 · Aug 24, 2018 · Aug 24, 2018 · Aug 27, 2018
diff --git a/html_text/__init__.py b/html_text/__init__.py
@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*-
 
-from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from .html_text import extract_text, parse_html, html_to_text
diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -4,9 +4,11 @@
 import lxml
 import lxml.etree
 from lxml.html.clean import Cleaner
-import parsel
 
 
+NEWLINE_TAGS = ['title', 'p', 'li', 'dd', 'dt', 'dl', 'ul',
+                'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+
 _clean_html = Cleaner(
     scripts=True,
     javascript=False,  # onclick attributes are fine
@@ -47,45 +49,71 @@ def parse_html(html):
 _has_punct_before = re.compile(r'\($').search
 
 
-def selector_to_text(sel, guess_punct_space=True):
-    """ Convert a cleaned selector to text.
-    See html_text.extract_text docstring for description of the approach and options.
+def html_to_text(tree, guess_punct_space=True, guess_page_layout=False):
+    """ Convert a cleaned html tree to text.
+        See html_text.extract_text docstring for description of the approach
+        and options.
+
+        Only text inside ``tree`` is returned: the tail text that follows
+        the root element in its parent is not included.
     """
-    if guess_punct_space:
-
-        def fragments():
-            prev = None
-            for text in sel.xpath('.//text()').extract():
-                if prev is not None and (_has_trailing_whitespace(prev)
-                                         or (not _has_punct_after(text) and
-                                             not _has_punct_before(prev))):
-                    yield ' '
-                yield text
-                prev = text
 
-        return _whitespace.sub(' ', ''.join(fragments()).strip())
-
-    else:
-        fragments = (x.strip() for x in sel.xpath('.//text()').extract())
-        return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
-
-
-def cleaned_selector(html):
-    """ Clean selector.
-    """
-    try:
-        tree = _cleaned_html_tree(html)
-        sel = parsel.Selector(root=tree, type='html')
-    except (lxml.etree.XMLSyntaxError,
-            lxml.etree.ParseError,
-            lxml.etree.ParserError,
-            UnicodeEncodeError):
-        # likely plain text
-        sel = parsel.Selector(html)
-    return sel
-
-
-def extract_text(html, guess_punct_space=True):
+    def add_space(text, prev):
+        # no space at the start, after whitespace or next to punctuation
+        return (prev is not None
+                and not _has_trailing_whitespace(prev)
+                and not _has_punct_after(text)
+                and not _has_punct_before(prev))
+
+    def add_newline(tag, prev):
+        return tag in NEWLINE_TAGS and prev != '\n'
+
+    def traverse_text_fragments(tree, prev, handle_tail=True):
+        # ``prev`` is a one-item list holding the last emitted fragment,
+        # shared by all recursive calls.
+        if tree.text:
+            if guess_punct_space:
+                text = _whitespace.sub(' ', tree.text.strip())
+                # skip whitespace-only text so it cannot reset ``prev``
+                if text:
+                    yield [' ' if add_space(text, prev[0]) else '', text]
+                    prev[0] = text
+            else:
+                yield [tree.text]
+                prev[0] = tree.text
+
+        for child in tree:
+            for t in traverse_text_fragments(child, prev):
+                yield t
+
+        tail_text = []
+        if guess_page_layout and add_newline(tree.tag, prev[0]):
+            tail_text.append('\n')
+            prev[0] = '\n'
+
+        if handle_tail and tree.tail:
+            if guess_punct_space:
+                text = _whitespace.sub(' ', tree.tail.strip())
+                if text:
+                    if (not tail_text  # do not add space after newline
+                            and add_space(text, prev[0])):
+                        tail_text.append(' ')
+                    tail_text.append(text)
+                    prev[0] = text
+            else:
+                tail_text.append(tree.tail)
+                prev[0] = tree.tail
+        if tail_text:
+            yield tail_text
+
+    text = []
+    for fragment in traverse_text_fragments(tree, [None], handle_tail=False):
+        text.extend(fragment)
+    return ''.join(text).strip()
+
+
+
+def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True):
     """
     Convert html to text, cleaning invisible content such as styles.
     Almost the same as normalize-space xpath, but this also
@@ -98,5 +126,10 @@ def extract_text(html, guess_punct_space=True):
 
     html should be a unicode string or an already parsed lxml.html element.
     """
-    sel = cleaned_selector(html)
-    return selector_to_text(sel, guess_punct_space=guess_punct_space)
+    # len() of an lxml element is its number of children, so a childless
+    # element must not be mistaken for an empty document.
+    if html is None or html == '':
+        return ''
+    cleaned = _cleaned_html_tree(html)
+    return html_to_text(cleaned, guess_punct_space=guess_punct_space,
+                        guess_page_layout=guess_page_layout)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -1,11 +1,15 @@
 # -*- coding: utf-8 -*-
 import pytest
 
-from html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from html_text import extract_text, html_to_text, parse_html
 
 
 @pytest.fixture(params=[{'guess_punct_space': True},
-                        {'guess_punct_space': False}])
+                        {'guess_punct_space': False},
+                        {'guess_punct_space': True, 'guess_page_layout': True},
+                        {'guess_punct_space': False, 'guess_page_layout': True}
+                        ])
+# Each test that takes this fixture runs once per option dict listed above.
 def all_options(request):
     return request.param
 
@@ -49,9 +53,20 @@ def test_punct_whitespace_preserved():
             u'по ле, and , more ! now a (boo)')
 
 
-def test_selector(all_options):
-    html = '<div><div id="extract-me">text<div>more</div></div>and more text</div>'
-    sel = cleaned_selector(html)
-    assert selector_to_text(sel, **all_options) == 'text more and more text'
-    subsel = sel.xpath('//div[@id="extract-me"]')[0]
-    assert selector_to_text(subsel, **all_options) == 'text more'
+def test_guess_page_layout():
+    """ Block-level tags become newlines only with guess_page_layout. """
+    html = (u'<title>title</title><div>text_1.<p>text_2 text_3</p><ul>'
+            '<li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
+            'text_8</p>text_9</div><p>...text_10</p>')
+    plain = extract_text(html, guess_punct_space=False)
+    assert plain == ('titletext_1.text_2 text_3text_4text_5'
+                     'text_6text_7text_8text_9...text_10')
+    layout = extract_text(html, guess_punct_space=False, guess_page_layout=True)
+    assert layout == ('title\ntext_1.text_2 text_3\ntext_4\ntext_5'
+                      '\ntext_6text_7text_8\ntext_9...text_10')
+    spaced = extract_text(html, guess_punct_space=True)
+    assert spaced == ('title text_1. text_2 text_3 text_4 text_5'
+                      ' text_6 text_7 text_8 text_9...text_10')
+    both = extract_text(html, guess_punct_space=True, guess_page_layout=True)
+    assert both == ('title\ntext_1. text_2 text_3\ntext_4\ntext_5'
+                    '\ntext_6 text_7 text_8\ntext_9...text_10')