From 0ae6d24cf5ef038f9cf5913cb0cce0d5bb714cbd Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 24 Aug 2018 14:48:54 -0700 Subject: [PATCH 01/40] add first working approach plus debug code --- html_text/html_text.py | 80 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 77 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 1b0462c..98920d0 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,7 +6,6 @@ from lxml.html.clean import Cleaner import parsel - _clean_html = Cleaner( scripts=True, javascript=False, # onclick attributes are fine @@ -69,6 +68,59 @@ def fragments(): fragments = (x.strip() for x in sel.xpath('.//text()').extract()) return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) +def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. + """ + + if guess_punct_space: + def add_newline(tag): + if tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + return '\n' + return '' + + def traverse_text_fragments(tree, prev): + space = '' + newline = '' + if tree.text: + text = _whitespace.sub(' ', tree.text.strip()) + if text: + if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) + and (not _has_punct_after(tree.text) and + not _has_punct_before(prev[0]))): + space = ' ' + if guess_page_layout: + newline = add_newline(tree.tag) + yield [space, text, newline] + prev[0] = (newline or text) + space = '' + newline = '' + + for child in tree: # where is my precious "yield from"? + for t in traverse_text_fragments(child, prev): + yield t + + if tree.tail: + text = _whitespace.sub(' ', tree.tail.strip()) + if text: + if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) + and (not _has_punct_after(tree.tail) and + not _has_punct_before(prev[0]))): + space = ' ' + if guess_page_layout: + newline = add_newline(tree.tag) + yield [space, text, newline] + prev[0] = (newline or text) + + text = [] + for fragment in traverse_text_fragments(tree, [None]): + text.extend(fragment) + return ''.join(text).strip() + + else: + # fragments = (x.strip() for x in sel.xpath('.//text()').extract()) + # return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) + pass def cleaned_selector(html): """ Clean selector. @@ -85,7 +137,7 @@ def cleaned_selector(html): return sel -def extract_text(html, guess_punct_space=True): +def extract_text(html, guess_punct_space=True, guess_page_layout=False): """ Convert html to text, cleaning invisible content such as styles. Almost the same as normalize-space xpath, but this also @@ -98,5 +150,27 @@ def extract_text(html, guess_punct_space=True): html should be a unicode string or an already parsed lxml.html element. 
""" + # from time import time + + + cleaned = _clean_html(html) + # t1 = time() + res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) + # t2 = time() + # print('NEW') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # else: + # # t0 = time() sel = cleaned_selector(html) - return selector_to_text(sel, guess_punct_space=guess_punct_space) + # t1 = time() + old = selector_to_text(sel, guess_punct_space=guess_punct_space) + # t2 = time() + # print('OLD') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # print('') + # t0 = time() + return res, old \ No newline at end of file From 566dc9b71c14f92f0ae38b84282ec76116343199 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 24 Aug 2018 16:16:21 -0700 Subject: [PATCH 02/40] add newline only at the end of selected tags --- html_text/html_text.py | 70 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 98920d0..92f2bfd 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -74,43 +74,42 @@ def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): """ if guess_punct_space: - def add_newline(tag): - if tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: - return '\n' - return '' def traverse_text_fragments(tree, prev): space = '' - newline = '' if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text: if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) and (not _has_punct_after(tree.text) and - not _has_punct_before(prev[0]))): + not _has_punct_before(prev[0]))): space = ' ' - if guess_page_layout: - newline = add_newline(tree.tag) - yield [space, text, newline] - prev[0] = (newline or text) + + yield [space, text] + prev[0] = text space = '' - newline = '' for child in tree: # where is my precious "yield from"? for t in traverse_text_fragments(child, prev): yield t + + tail_text = [] + if guess_page_layout and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + tail_text.append('\n') + prev[0] = '\n' if tree.tail: text = _whitespace.sub(' ', tree.tail.strip()) if text: - if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) - and (not _has_punct_after(tree.tail) and - not _has_punct_before(prev[0]))): - space = ' ' - if guess_page_layout: - newline = add_newline(tree.tag) - yield [space, text, newline] - prev[0] = (newline or text) + if (not tail_text and prev[0] is not None and + not _has_trailing_whitespace(prev[0]) and + not _has_punct_after(tree.tail) and + not _has_punct_before(prev[0])): + tail_text.append(' ') + tail_text.append(text) + prev[0] = text + if tail_text: + yield tail_text text = [] for fragment in traverse_text_fragments(tree, [None]): @@ -137,7 +136,7 @@ def cleaned_selector(html): return sel -def extract_text(html, guess_punct_space=True, guess_page_layout=False): +def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. 
Almost the same as normalize-space xpath, but this also @@ -152,20 +151,21 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False): """ # from time import time - - cleaned = _clean_html(html) - # t1 = time() - res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) - # t2 = time() - # print('NEW') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # else: - # # t0 = time() - sel = cleaned_selector(html) - # t1 = time() - old = selector_to_text(sel, guess_punct_space=guess_punct_space) + if new: + cleaned = _clean_html(html) + # t1 = time() + res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) + # t2 = time() + # print('NEW') + # print('clean_time: ', t1 - t0) + # print('text_time: ', t2 - t1) + # print('total_time: ', t2 - t0) + # else: + # # t0 = time() + else: + sel = cleaned_selector(html) + # t1 = time() + res = selector_to_text(sel, guess_punct_space=guess_punct_space) # t2 = time() # print('OLD') # print('clean_time: ', t1 - t0) @@ -173,4 +173,4 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False): # print('total_time: ', t2 - t0) # print('') # t0 = time() - return res, old \ No newline at end of file + return res \ No newline at end of file From 587e9a713a1eb4448c6d11954bccda6f3281ff02 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 27 Aug 2018 12:04:03 -0700 Subject: [PATCH 03/40] fix multiple consecutive newlines --- html_text/html_text.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 92f2bfd..6bc1c23 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -94,7 +94,9 @@ def traverse_text_fragments(tree, prev): yield t tail_text = [] - if guess_page_layout and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl']: + if (guess_page_layout + and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] + and prev[0] != '\n'): tail_text.append('\n') prev[0] = '\n' From 6c9d27e3623416a95950100d310600d0e88bbcb3 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 27 Aug 2018 13:34:09 -0700 Subject: [PATCH 04/40] add guess_space = False option --- html_text/__init__.py | 2 +- html_text/html_text.py | 160 ++++++++++++++--------------------------- 2 files changed, 55 insertions(+), 107 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index db40e63..9c9d86a 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text +from .html_text import extract_text, parse_html, html_to_text diff --git a/html_text/html_text.py b/html_text/html_text.py index 6bc1c23..62ecb2f 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -4,7 +4,6 @@ import lxml import lxml.etree from lxml.html.clean import Cleaner -import parsel _clean_html = Cleaner( scripts=True, @@ -46,96 +45,66 @@ def parse_html(html): _has_punct_before = re.compile(r'\($').search -def selector_to_text(sel, guess_punct_space=True): - """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. +def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned html tree to text. + See html_text.extract_text docstring for description of the approach + and options. 
""" - if guess_punct_space: - def fragments(): - prev = None - for text in sel.xpath('.//text()').extract(): - if prev is not None and (_has_trailing_whitespace(prev) - or (not _has_punct_after(text) and - not _has_punct_before(prev))): - yield ' ' - yield text - prev = text - - return _whitespace.sub(' ', ''.join(fragments()).strip()) - - else: - fragments = (x.strip() for x in sel.xpath('.//text()').extract()) - return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) - -def selector_to_text_new(tree, guess_punct_space=True, guess_page_layout=False): - """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. - """ - - if guess_punct_space: - - def traverse_text_fragments(tree, prev): - space = '' - if tree.text: + def should_add_space(text, prev): + return (prev is not None + and (not _has_trailing_whitespace(prev) + and (not _has_punct_after(text) + and not _has_punct_before(prev) + ) + ) + ) + + def traverse_text_fragments(tree, prev): + space = '' + if tree.text: + if guess_punct_space: text = _whitespace.sub(' ', tree.text.strip()) - if text: - if prev[0] is not None and (not _has_trailing_whitespace(prev[0]) - and (not _has_punct_after(tree.text) and - not _has_punct_before(prev[0]))): - space = ' ' - - yield [space, text] - prev[0] = text - space = '' - - for child in tree: # where is my precious "yield from"? - for t in traverse_text_fragments(child, prev): - yield t - - tail_text = [] - if (guess_page_layout - and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] - and prev[0] != '\n'): - tail_text.append('\n') - prev[0] = '\n' - - if tree.tail: + if text and should_add_space(text, prev[0]): + space = ' ' + yield [space, text] + prev[0] = text + space = '' + else: + yield[tree.text] + + for child in tree: + for t in traverse_text_fragments(child, prev): + yield t + + tail_text = [] + if (guess_page_layout + and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] + and prev[0] != '\n' + ): + tail_text.append('\n') + prev[0] = '\n' + + if tree.tail: + if guess_punct_space: text = _whitespace.sub(' ', tree.tail.strip()) if text: - if (not tail_text and prev[0] is not None and - not _has_trailing_whitespace(prev[0]) and - not _has_punct_after(tree.tail) and - not _has_punct_before(prev[0])): + if (not tail_text # do not add space after newline + and should_add_space(text, prev[0])): tail_text.append(' ') + tail_text.append(text) prev[0] = text - if tail_text: - yield tail_text - - text = [] - for fragment in traverse_text_fragments(tree, [None]): - text.extend(fragment) - return ''.join(text).strip() + else: + tail_text.append(tree.tail) + if tail_text: + yield tail_text - else: - # fragments = (x.strip() for x in sel.xpath('.//text()').extract()) - # return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) - pass + text = [] + for fragment in traverse_text_fragments(tree, [None]): + text.extend(fragment) + return ''.join(text).strip() -def cleaned_selector(html): - """ Clean selector. 
- """ - try: - tree = _cleaned_html_tree(html) - sel = parsel.Selector(root=tree, type='html') - except (lxml.etree.XMLSyntaxError, - lxml.etree.ParseError, - lxml.etree.ParserError, - UnicodeEncodeError): - # likely plain text - sel = parsel.Selector(html) - return sel def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): @@ -151,28 +120,7 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True html should be a unicode string or an already parsed lxml.html element. """ - # from time import time - - if new: - cleaned = _clean_html(html) - # t1 = time() - res = selector_to_text_new(cleaned, guess_page_layout=guess_page_layout) - # t2 = time() - # print('NEW') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # else: - # # t0 = time() - else: - sel = cleaned_selector(html) - # t1 = time() - res = selector_to_text(sel, guess_punct_space=guess_punct_space) - # t2 = time() - # print('OLD') - # print('clean_time: ', t1 - t0) - # print('text_time: ', t2 - t1) - # print('total_time: ', t2 - t0) - # print('') - # t0 = time() - return res \ No newline at end of file + if not html: + return '' + cleaned = _cleaned_html_tree(html) + return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) From c22f3fa6ede49560c39e6ad702b310fe32b5ca62 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 28 Aug 2018 12:39:18 -0700 Subject: [PATCH 05/40] move add space and newline checks to a function --- html_text/html_text.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 62ecb2f..007a739 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -5,6 +5,10 @@ import lxml.etree from lxml.html.clean import Cleaner + +NEWLINE_TAGS = ['title', 'p', 'li', 'dd', 'dt', 'dl', 'ul', + 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + _clean_html = Cleaner( scripts=True, javascript=False, # onclick attributes are fine @@ -51,7 +55,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): and options. 
""" - def should_add_space(text, prev): + def add_space(text, prev): return (prev is not None and (not _has_trailing_whitespace(prev) and (not _has_punct_after(text) @@ -60,28 +64,29 @@ def should_add_space(text, prev): ) ) + def add_newline(tag, prev): + return tag in NEWLINE_TAGS and prev != '\n' + def traverse_text_fragments(tree, prev): space = '' if tree.text: if guess_punct_space: text = _whitespace.sub(' ', tree.text.strip()) - if text and should_add_space(text, prev[0]): + if text and add_space(text, prev[0]): space = ' ' yield [space, text] prev[0] = text space = '' else: - yield[tree.text] + yield [tree.text] + prev[0] = tree.text for child in tree: for t in traverse_text_fragments(child, prev): yield t tail_text = [] - if (guess_page_layout - and tree.tag in ['title', 'p', 'h1', 'li', 'dd', 'dt', 'dl'] - and prev[0] != '\n' - ): + if guess_page_layout and add_newline(tree.tag, prev[0]): tail_text.append('\n') prev[0] = '\n' @@ -90,13 +95,14 @@ def traverse_text_fragments(tree, prev): text = _whitespace.sub(' ', tree.tail.strip()) if text: if (not tail_text # do not add space after newline - and should_add_space(text, prev[0])): + and add_space(text, prev[0])): tail_text.append(' ') tail_text.append(text) prev[0] = text else: tail_text.append(tree.tail) + prev[0] = tree.tail if tail_text: yield tail_text @@ -120,7 +126,7 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True html should be a unicode string or an already parsed lxml.html element. """ - if not html: + if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) From 8a78fc58c5d6a721489038158564112b24bd3f49 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 28 Aug 2018 12:39:40 -0700 Subject: [PATCH 06/40] add tests guess_page_layout --- tests/test_html_text.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 594ea3c..648c319 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,11 +1,15 @@ # -*- coding: utf-8 -*- import pytest -from html_text import extract_text, parse_html, cleaned_selector, selector_to_text +from html_text import extract_text, html_to_text, parse_html @pytest.fixture(params=[{'guess_punct_space': True}, - {'guess_punct_space': False}]) + {'guess_punct_space': False}, + {'guess_punct_space': True, 'guess_page_layout': True}, + {'guess_punct_space': False, 'guess_page_layout': True} + ]) + def all_options(request): return request.param @@ -49,9 +53,27 @@ def test_punct_whitespace_preserved(): u'по ле, and , more ! now a (boo)') -def test_selector(all_options): - html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>'
-    sel = cleaned_selector(html)
-    assert selector_to_text(sel, **all_options) == 'text more and more text'
-    subsel = sel.xpath('//div[@id="extract-me"]')[0]
-    assert selector_to_text(subsel, **all_options) == 'text more'
+# def test_selector(all_options):
+#     html = '<div><div id="extract-me">text<div>more</div></div>and more text</div>'
+#     sel = cleaned_selector(html)
+#     assert selector_to_text(sel, **all_options) == 'text more and more text'
+#     subsel = sel.xpath('//div[@id="extract-me"]')[0]
+#     assert selector_to_text(subsel, **all_options) == 'text more'
+
+def test_guess_page_layout():
+    html = (u'<title>title</title>text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
+            'text_8</p><div>text_9</div><p>...text_10</p>
' + ) + assert (extract_text(html, guess_punct_space=False) == + ('titletext_1.text_2 text_3text_4text_5' + 'text_6text_7text_8text_9...text_10')) + assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == + ('title\ntext_1.text_2 text_3\ntext_4\ntext_5' + '\ntext_6text_7text_8\ntext_9...text_10')) + assert (extract_text(html, guess_punct_space=True) == + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == + ('title\ntext_1. text_2 text_3\ntext_4\ntext_5' + '\ntext_6 text_7 text_8\ntext_9...text_10')) From a783e3134265d7653b2cd31a2f97d4d4a3e8ae6d Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Wed, 29 Aug 2018 11:32:24 -0700 Subject: [PATCH 07/40] remove old test --- tests/test_html_text.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 648c319..ec0cdc6 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -53,13 +53,6 @@ def test_punct_whitespace_preserved(): u'по ле, and , more ! now a (boo)') -# def test_selector(all_options): -# html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' -# sel = cleaned_selector(html) -# assert selector_to_text(sel, **all_options) == 'text more and more text' -# subsel = sel.xpath('//div[@id="extract-me"]')[0] -# assert selector_to_text(subsel, **all_options) == 'text more' - def test_guess_page_layout(): html = (u'title
text_1.<p>text_2 text_3</p>'
             '<ul><li>text_4</li><li>text_5</li></ul><p>
text_6text_7' From cb8dc1cf3831c11b1408006fcb56c2c74795a716 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:13:06 -0700 Subject: [PATCH 08/40] guess_punct_space = False behavior same as before this PR --- html_text/html_text.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 007a739..28d4592 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -56,6 +56,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): """ def add_space(text, prev): + # return True if a space should be added return (prev is not None and (not _has_trailing_whitespace(prev) and (not _has_punct_after(text) @@ -68,43 +69,41 @@ def add_newline(tag, prev): return tag in NEWLINE_TAGS and prev != '\n' def traverse_text_fragments(tree, prev): - space = '' + space = ' ' if tree.text: - if guess_punct_space: - text = _whitespace.sub(' ', tree.text.strip()) - if text and add_space(text, prev[0]): - space = ' ' + text = _whitespace.sub(' ', tree.text.strip()) + if text: + if guess_punct_space and not add_space(text, prev[0]): + space = '' yield [space, text] prev[0] = text - space = '' - else: - yield [tree.text] - prev[0] = tree.text + space = ' ' for child in tree: for t in traverse_text_fragments(child, prev): yield t - tail_text = [] + newline = '' if guess_page_layout and add_newline(tree.tag, prev[0]): - tail_text.append('\n') + newline = '\n' prev[0] = '\n' + tail = '' if tree.tail: - if guess_punct_space: - text = _whitespace.sub(' ', tree.tail.strip()) - if text: - if (not tail_text # do not add space after newline - and add_space(text, prev[0])): - tail_text.append(' ') - - tail_text.append(text) - prev[0] = text - else: - tail_text.append(tree.tail) - prev[0] = tree.tail - if tail_text: - yield tail_text + tail = _whitespace.sub(' ', tree.tail.strip()) + if tail: + if (guess_punct_space + and (not add_space(tail, prev[0]) or newline)): + space = '' + + if tail: + yield [newline, space, tail] + prev[0] = tail + # space = ' ' + # newline = '' + elif newline: + yield [newline] + # newline = '' text = [] for fragment in traverse_text_fragments(tree, [None]): From fb599bcc08c7c0af5fba6f000b01390046cd41cd Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:13:13 -0700 Subject: [PATCH 09/40] fix tests --- tests/test_html_text.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index ec0cdc6..455ebff 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -54,16 +54,16 @@ def test_punct_whitespace_preserved(): def test_guess_page_layout(): - html = (u'title

text_1.<p>text_2 text_3</p>'
-            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
-            'text_8</p><div>text_9</div><p>...text_10</p>
' + html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
+            'text_8</p><div>text_9</div><p>...text_10</p>
' ) assert (extract_text(html, guess_punct_space=False) == - ('titletext_1.text_2 text_3text_4text_5' - 'text_6text_7text_8text_9...text_10')) + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == - ('title\ntext_1.text_2 text_3\ntext_4\ntext_5' - '\ntext_6text_7text_8\ntext_9...text_10')) + ('title\n text_1. text_2 text_3\n text_4\n text_5' + '\n text_6 text_7 text_8\n text_9 ...text_10')) assert (extract_text(html, guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9...text_10')) From 90e37b76a17142f26011100ca5c67d33017fd5d4 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 13:36:34 -0700 Subject: [PATCH 10/40] fixed tests --- html_text/html_text.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 28d4592..f52cbe0 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -46,7 +46,7 @@ def parse_html(html): _whitespace = re.compile(r'\s+') _has_trailing_whitespace = re.compile(r'\s$').search _has_punct_after = re.compile(r'^[,:;.!?"\)]').search -_has_punct_before = re.compile(r'\($').search +_has_open_bracket_before = re.compile(r'\($').search def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): @@ -57,10 +57,12 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): def add_space(text, prev): # return True if a space should be added + if prev == '\n': + return False return (prev is not None - and (not _has_trailing_whitespace(prev) - and (not _has_punct_after(text) - and not _has_punct_before(prev) + and (_has_trailing_whitespace(prev) + or (not _has_punct_after(text) + and not _has_open_bracket_before(prev) ) ) ) @@ -76,7 +78,7 @@ def traverse_text_fragments(tree, prev): if guess_punct_space and not add_space(text, prev[0]): space = '' yield [space, text] - prev[0] = text + prev[0] = tree.text space = ' ' for child in tree: @@ -98,12 +100,9 @@ def traverse_text_fragments(tree, prev): if tail: yield [newline, space, tail] - prev[0] = tail - # space = ' ' - # newline = '' + prev[0] = tree.tail elif newline: yield [newline] - # newline = '' text = [] for fragment in traverse_text_fragments(tree, [None]): From ae26d29ea693a87498595b0af5609f223e0b7590 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 16:46:34 -0700 Subject: [PATCH 11/40] fix indent and make add_space more readable --- html_text/html_text.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index f52cbe0..3943835 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -50,22 +50,21 @@ def parse_html(html): def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): - """ Convert a cleaned html tree to text. - See html_text.extract_text docstring for description of the approach - and options. + """ + Convert a cleaned html tree to text. + See html_text.extract_text docstring for description of the approach + and options. 
""" def add_space(text, prev): - # return True if a space should be added + if prev is None: + return False if prev == '\n': return False - return (prev is not None - and (_has_trailing_whitespace(prev) - or (not _has_punct_after(text) - and not _has_open_bracket_before(prev) - ) - ) - ) + if not _has_trailing_whitespace(prev): + if _has_punct_after(text) or _has_open_bracket_before(prev): + return False + return True def add_newline(tag, prev): return tag in NEWLINE_TAGS and prev != '\n' From bb33d4b51d77a6bcd9a7fb107f2033a6d83c5133 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 30 Aug 2018 17:25:52 -0700 Subject: [PATCH 12/40] add double newline before and after title, p and h tags --- html_text/html_text.py | 40 ++++++++++++++++++++++++++-------------- tests/test_html_text.py | 12 ++++++------ 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 3943835..cb09a9a 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,8 +6,8 @@ from lxml.html.clean import Cleaner -NEWLINE_TAGS = ['title', 'p', 'li', 'dd', 'dt', 'dl', 'ul', - 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] +DOUBLE_NEWLINE_TAGS = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] _clean_html = Cleaner( scripts=True, @@ -59,7 +59,7 @@ def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): def add_space(text, prev): if prev is None: return False - if prev == '\n': + if prev == '\n' or prev == '\n\n': return False if not _has_trailing_whitespace(prev): if _has_punct_after(text) or _has_open_bracket_before(prev): @@ -67,36 +67,50 @@ def add_space(text, prev): return True def add_newline(tag, prev): - return tag in NEWLINE_TAGS and prev != '\n' + if prev is None or prev == '\n\n': + return '' + if tag in DOUBLE_NEWLINE_TAGS: + if prev == '\n': + return '\n' + return '\n\n' + if tag in NEWLINE_TAGS: + if prev == '\n': + return '' + return '\n' + return '' def traverse_text_fragments(tree, prev): space = ' ' + newline = '' if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text: + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline if guess_punct_space and not add_space(text, prev[0]): space = '' - yield [space, text] + yield [newline, space, text] prev[0] = tree.text space = ' ' + newline = '' for child in tree: for t in traverse_text_fragments(child, prev): yield t - newline = '' - if guess_page_layout and add_newline(tree.tag, prev[0]): - newline = '\n' - prev[0] = '\n' + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline tail = '' if tree.tail: tail = _whitespace.sub(' ', tree.tail.strip()) if tail: - if (guess_punct_space - and (not add_space(tail, prev[0]) or newline)): + if guess_punct_space and not add_space(tail, prev[0]): space = '' - if tail: yield [newline, space, tail] prev[0] = tree.tail @@ -108,8 +122,6 @@ def traverse_text_fragments(tree, prev): text.extend(fragment) return ''.join(text).strip() - - def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 455ebff..96ce01d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -62,11 +62,11 @@ def test_guess_page_layout(): ('title text_1. 
text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9 ...text_10')) assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == - ('title\n text_1. text_2 text_3\n text_4\n text_5' - '\n text_6 text_7 text_8\n text_9 ...text_10')) + ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) assert (extract_text(html, guess_punct_space=True) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) + ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == - ('title\ntext_1. text_2 text_3\ntext_4\ntext_5' - '\ntext_6 text_7 text_8\ntext_9...text_10')) + ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 3069a7339ac29731e32ce5f26ca31351cf6cf41b Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 6 Sep 2018 07:58:16 -0700 Subject: [PATCH 13/40] by default tail of root node will not be extracted --- html_text/__init__.py | 2 +- html_text/html_text.py | 35 ++++++++++++++++++++++++++++++----- tests/test_html_text.py | 10 +++++++++- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index 9c9d86a..661a8a1 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,3 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, html_to_text +from .html_text import extract_text, parse_html, html_to_text, cleaned_selector, selector_to_text diff --git a/html_text/html_text.py b/html_text/html_text.py index cb09a9a..6053480 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -4,6 +4,7 @@ import lxml import lxml.etree from lxml.html.clean import Cleaner +import parsel NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] @@ -79,7 +80,7 @@ def add_newline(tag, prev): return '\n' return '' - def traverse_text_fragments(tree, prev): + def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' if tree.text: @@ -97,7 +98,7 @@ def traverse_text_fragments(tree, prev): newline = '' for child in tree: - for t in traverse_text_fragments(child, prev): + for t in traverse_text_fragments(child, prev, depth+1): yield t if guess_page_layout: @@ -106,7 +107,7 @@ def traverse_text_fragments(tree, prev): prev[0] = newline tail = '' - if tree.tail: + if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) if tail: if guess_punct_space and not add_space(tail, prev[0]): @@ -118,10 +119,34 @@ def traverse_text_fragments(tree, prev): yield [newline] text = [] - for fragment in traverse_text_fragments(tree, [None]): + for fragment in traverse_text_fragments(tree, [None], 0): text.extend(fragment) return ''.join(text).strip() + +def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): + """ Convert a cleaned selector to text. + See html_text.extract_text docstring for description of the approach and options. + """ + return html_to_text(sel.root, guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) + + +def cleaned_selector(html): + """ Clean selector. 
+ """ + try: + tree = _cleaned_html_tree(html) + sel = parsel.Selector(root=tree, type='html') + except (lxml.etree.XMLSyntaxError, + lxml.etree.ParseError, + lxml.etree.ParserError, + UnicodeEncodeError): + # likely plain text + sel = parsel.Selector(html) + return sel + + def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): """ Convert html to text, cleaning invisible content such as styles. @@ -138,4 +163,4 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) + return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout,) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 96ce01d..ff81fa7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,7 +1,8 @@ # -*- coding: utf-8 -*- import pytest -from html_text import extract_text, html_to_text, parse_html +from html_text import (extract_text, html_to_text, parse_html, parse_html, + cleaned_selector, selector_to_text) @pytest.fixture(params=[{'guess_punct_space': True}, @@ -52,6 +53,13 @@ def test_punct_whitespace_preserved(): assert (extract_text(html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)') +def test_selector(all_options): + html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' + sel = cleaned_selector(html) + assert selector_to_text(sel, **all_options) == 'text more and more text' + subsel = sel.xpath('//div[@id="extract-me"]')[0] + assert selector_to_text(subsel, **all_options) == 'text more' + def test_guess_page_layout(): html = (u' title
text_1.<p>text_2 text_3</p>'
    ' From dd032013843361d5d3e9eedad231ff2116c7e4e9 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Thu, 6 Sep 2018 12:18:29 -0700 Subject: [PATCH 14/40] add test --- tests/test_html_text.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index ff81fa7..6f180f7 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- import pytest +import lxml from html_text import (extract_text, html_to_text, parse_html, parse_html, cleaned_selector, selector_to_text) @@ -61,20 +62,34 @@ def test_selector(all_options): assert selector_to_text(subsel, **all_options) == 'text more' -def test_guess_page_layout(): +def test_html_to_text(): html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>
    text_6text_7' - 'text_8

    text_9

    ...text_10

    ' - ) - assert (extract_text(html, guess_punct_space=False) == + 'text_8

    text_9

...text_10

') + + parser = lxml.html.HTMLParser(encoding='utf8') + tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) + + assert (html_to_text(tree, guess_punct_space=False) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9 ...text_10')) - assert (extract_text(html, guess_punct_space=False, guess_page_layout=True) == + assert (html_to_text(tree, guess_punct_space=False, guess_page_layout=True) == ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (extract_text(html, guess_punct_space=True) == + assert (html_to_text(tree, guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' ' text_6 text_7 text_8 text_9...text_10')) + assert (html_to_text(tree, guess_punct_space=True, guess_page_layout=True) == + ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + +def test_guess_page_layout(): + html = (u' title
text_1.<p>text_2 text_3</p>'
+            '<ul><li>text_4</li><li>text_5</li></ul><p>
' + '

text_6text_7text_8

text_9
' + '

...text_10

' + ) assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 0f2fb2b840d71538cf874027a8026167fd96d2e6 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 12:45:17 -0700 Subject: [PATCH 15/40] fix indentation --- tests/test_html_text.py | 72 +++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 6f180f7..9a64aea 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,16 +2,21 @@ import pytest import lxml -from html_text import (extract_text, html_to_text, parse_html, parse_html, +from html_text import (extract_text, html_to_text, parse_html, cleaned_selector, selector_to_text) -@pytest.fixture(params=[{'guess_punct_space': True}, - {'guess_punct_space': False}, - {'guess_punct_space': True, 'guess_page_layout': True}, - {'guess_punct_space': False, 'guess_page_layout': True} - ]) - +@pytest.fixture(params=[{ + 'guess_punct_space': True +}, { + 'guess_punct_space': False +}, { + 'guess_punct_space': True, + 'guess_page_layout': True +}, { + 'guess_punct_space': False, + 'guess_page_layout': True +}]) def all_options(request): return request.param @@ -51,8 +56,9 @@ def test_punct_whitespace(): def test_punct_whitespace_preserved(): html = (u'
поле, and , ' u'more !now
a (boo)') - assert (extract_text(html, guess_punct_space=True) == - u'по ле, and , more ! now a (boo)') + assert (extract_text( + html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)') + def test_selector(all_options): html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
' @@ -64,32 +70,36 @@ def test_selector(all_options): def test_html_to_text(): html = (u' title
text_1.

text_2 text_3

    ' - '
  • text_4
  • text_5

text_6text_7' - 'text_8

text_9

...text_10

') + '
  • text_4
  • text_5
  • text_6text_7' + 'text_8

    text_9

    ...text_10

    ') parser = lxml.html.HTMLParser(encoding='utf8') tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) - assert (html_to_text(tree, guess_punct_space=False) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9 ...text_10')) - assert (html_to_text(tree, guess_punct_space=False, guess_page_layout=True) == - ('title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' - '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (html_to_text(tree, guess_punct_space=True) == - ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) - assert (html_to_text(tree, guess_punct_space=True, guess_page_layout=True) == - ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + assert (html_to_text(tree, guess_punct_space=False) == ( + 'title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) + assert (html_to_text( + tree, guess_punct_space=False, guess_page_layout=True) == ( + 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) + assert (html_to_text( + tree, + guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (html_to_text( + tree, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + def test_guess_page_layout(): html = (u' title
    text_1.

    text_2 text_3

    ' - '

    • text_4
    • text_5
    ' - '

    text_6text_7text_8

    text_9
    ' - '

    ...text_10

    ' - ) - assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == - ('title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + '

    • text_4
    • text_5
    ' + '

    text_6text_7text_8

    text_9' + '

    ...text_10

    ') + assert (extract_text( + html, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From e8da507281e06487fc635e648a34403a11f858f6 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 12:46:55 -0700 Subject: [PATCH 16/40] newline tags as set and extendable, add new features comments, delete new argument --- html_text/html_text.py | 43 +++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 6053480..9805dcd 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,9 +6,9 @@ from lxml.html.clean import Cleaner import parsel - -NEWLINE_TAGS = ['li', 'dd', 'dt', 'dl', 'ul', 'ol'] -DOUBLE_NEWLINE_TAGS = ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] +NEWLINE_TAGS = frozenset(['li', 'dd', 'dt', 'dl', 'ul', 'ol']) +DOUBLE_NEWLINE_TAGS = frozenset( + ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']) _clean_html = Cleaner( scripts=True, @@ -98,7 +98,7 @@ def traverse_text_fragments(tree, prev, depth): newline = '' for child in tree: - for t in traverse_text_fragments(child, prev, depth+1): + for t in traverse_text_fragments(child, prev, depth + 1): yield t if guess_page_layout: @@ -126,10 +126,13 @@ def traverse_text_fragments(tree, prev, depth): def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): """ Convert a cleaned selector to text. - See html_text.extract_text docstring for description of the approach and options. + See html_text.extract_text docstring for description of the approach + and options. """ - return html_to_text(sel.root, guess_punct_space=guess_punct_space, - guess_page_layout=guess_page_layout) + return html_to_text( + sel.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) def cleaned_selector(html): @@ -138,16 +141,18 @@ def cleaned_selector(html): try: tree = _cleaned_html_tree(html) sel = parsel.Selector(root=tree, type='html') - except (lxml.etree.XMLSyntaxError, - lxml.etree.ParseError, - lxml.etree.ParserError, - UnicodeEncodeError): + except (lxml.etree.XMLSyntaxError, lxml.etree.ParseError, + lxml.etree.ParserError, UnicodeEncodeError): # likely plain text sel = parsel.Selector(html) return sel -def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True): +def extract_text(html, + guess_punct_space=True, + guess_page_layout=False, + newline_tags=NEWLINE_TAGS, + double_newline_tags=DOUBLE_NEWLINE_TAGS): """ Convert html to text, cleaning invisible content such as styles. Almost the same as normalize-space xpath, but this also @@ -158,9 +163,21 @@ def extract_text(html, guess_punct_space=True, guess_page_layout=False, new=True for punctuation. This has a slight (around 10%) performance overhead and is just a heuristic. + When guess_page_layout is True (default is False), a newline is added after + NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic + makes the extracted text more similar to how it looks like in the browser. + + NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized. + html should be a unicode string or an already parsed lxml.html element. 
""" if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text(cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout,) + return html_to_text( + cleaned, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout, + newline_tags=newline_tags, + double_newline_tags=double_newline_tags, + ) From 0b9d1398639f836c9127b2310180850083312892 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 17:22:18 -0700 Subject: [PATCH 17/40] make html_to_text private, fix its signature --- html_text/__init__.py | 3 ++- html_text/html_text.py | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/html_text/__init__.py b/html_text/__init__.py index 661a8a1..61ef192 100644 --- a/html_text/__init__.py +++ b/html_text/__init__.py @@ -1,3 +1,4 @@ # -*- coding: utf-8 -*- -from .html_text import extract_text, parse_html, html_to_text, cleaned_selector, selector_to_text +from .html_text import (extract_text, parse_html, cleaned_selector, + selector_to_text) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9805dcd..cbb32e1 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -50,7 +50,11 @@ def parse_html(html): _has_open_bracket_before = re.compile(r'\($').search -def html_to_text(tree, guess_punct_space=True, guess_page_layout=False): +def _html_to_text(tree, + guess_punct_space=True, + guess_page_layout=False, + newline_tags=NEWLINE_TAGS, + double_newline_tags=DOUBLE_NEWLINE_TAGS): """ Convert a cleaned html tree to text. See html_text.extract_text docstring for description of the approach @@ -70,11 +74,11 @@ def add_space(text, prev): def add_newline(tag, prev): if prev is None or prev == '\n\n': return '' - if tag in DOUBLE_NEWLINE_TAGS: + if tag in double_newline_tags: if prev == '\n': return '\n' return '\n\n' - if tag in NEWLINE_TAGS: + if tag in newline_tags: if prev == '\n': return '' return '\n' @@ -129,7 +133,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): See html_text.extract_text docstring for description of the approach and options. 
""" - return html_to_text( + return _html_to_text( sel.root, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout) @@ -174,7 +178,7 @@ def extract_text(html, if html is None or len(html) == 0: return '' cleaned = _cleaned_html_tree(html) - return html_to_text( + return _html_to_text( cleaned, guess_punct_space=guess_punct_space, guess_page_layout=guess_page_layout, From ba7cdc0f39cb1c1483d0afd0b8183b8da6d8484b Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Fri, 7 Sep 2018 17:59:01 -0700 Subject: [PATCH 18/40] add new tags to handle --- html_text/html_text.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index cbb32e1..8b56252 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -6,9 +6,14 @@ from lxml.html.clean import Cleaner import parsel -NEWLINE_TAGS = frozenset(['li', 'dd', 'dt', 'dl', 'ul', 'ol']) -DOUBLE_NEWLINE_TAGS = frozenset( - ['title', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']) +NEWLINE_TAGS = frozenset([ + 'br', 'article', 'aside', 'details', 'div', 'dd', 'dt', 'fieldset', + 'figcaption', 'form', 'hr', 'li', 'main', 'nav', 'table', 'tr' +]) +DOUBLE_NEWLINE_TAGS = frozenset([ + 'blockquote', 'dl', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', + 'ol', 'ul', 'p', 'pre', 'title', 'figure' +]) _clean_html = Cleaner( scripts=True, From 952d8957030b5c5413f4eba7b1ba5743f9b5a4a1 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 16:45:21 -0700 Subject: [PATCH 19/40] handle more tags --- html_text/html_text.py | 42 ++++++++++++++++++---------- tests/test_html_text.py | 62 ++++++++++++++++------------------------- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 8b56252..6dd7e36 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -7,12 +7,19 @@ import parsel NEWLINE_TAGS = frozenset([ - 'br', 'article', 'aside', 'details', 'div', 'dd', 'dt', 'fieldset', - 'figcaption', 'form', 'hr', 'li', 'main', 'nav', 'table', 'tr' + 'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset', + 'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main', + 'nav', 'table', 'tr' ]) DOUBLE_NEWLINE_TAGS = frozenset([ - 'blockquote', 'dl', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', - 'ol', 'ul', 'p', 'pre', 'title', 'figure' + 'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', + 'p', 'pre', 'title', 'ul' +]) +INLINE_TEXT_TAGS = frozenset([ + 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'cite', 'code', 'data', 'del', 'dfn', + 'em', 'i', 'ins', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', + 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'tt', 'u', 'var', + 'wbr', ]) _clean_html = Cleaner( @@ -30,6 +37,7 @@ annoying_tags=False, remove_unknown_tags=False, safe_attrs_only=False, + remove_tags=INLINE_TEXT_TAGS, # helps newline placement if guess_page_layout=True ).clean_html @@ -92,19 +100,23 @@ def add_newline(tag, prev): def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' + text = '' + if guess_page_layout: + newline = add_newline(tree.tag, prev[0]) + if newline: + prev[0] = newline if tree.text: text = _whitespace.sub(' ', tree.text.strip()) - if text: - if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline - if guess_punct_space and not add_space(text, prev[0]): - space = '' - yield [newline, space, text] - prev[0] = tree.text - space = ' ' - 
newline = '' + if text and guess_punct_space and not add_space(text, prev[0]): + space = '' + if text: + yield [newline, space, text] + prev[0] = tree.text + space = ' ' + newline = '' + elif newline: + yield [newline] + newline = '' for child in tree: for t in traverse_text_fragments(child, prev, depth + 1): diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 9a64aea..995d673 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,8 +2,8 @@ import pytest import lxml -from html_text import (extract_text, html_to_text, parse_html, - cleaned_selector, selector_to_text) +from html_text import (extract_text, parse_html, cleaned_selector, + selector_to_text) @pytest.fixture(params=[{ @@ -67,39 +67,25 @@ def test_selector(all_options): subsel = sel.xpath('//div[@id="extract-me"]')[0] assert selector_to_text(subsel, **all_options) == 'text more' - -def test_html_to_text(): - html = (u' title
text_1.<p>text_2 text_3</p>'
-            '<ul><li>text_4</li><li>text_5</li></ul><p>text_6<em>text_7</em>'
-            'text_8</p><div>text_9</div><p>...text_10</p>
    ') - - parser = lxml.html.HTMLParser(encoding='utf8') - tree = lxml.html.fromstring(html.encode('utf8'), parser=parser) - - assert (html_to_text(tree, guess_punct_space=False) == ( - 'title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9 ...text_10')) - assert (html_to_text( - tree, guess_punct_space=False, guess_page_layout=True) == ( - 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' - '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) - assert (html_to_text( - tree, - guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' - ' text_6 text_7 text_8 text_9...text_10')) - assert (html_to_text( - tree, guess_punct_space=True, guess_page_layout=True) == ( - 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) - - -def test_guess_page_layout(): - html = (u' title
    text_1.

    text_2 text_3

    ' - '

    • text_4
    • text_5
    ' - '

    text_6text_7text_8

    text_9
    ' - '

    ...text_10

    ') - assert (extract_text( - html, guess_punct_space=True, guess_page_layout=True) == ( - 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' - '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) +# +# def test_guess_page_layout(): +# html = (u' title
    text_1.

    text_2 text_3

    ' +# '

    • text_4
    • text_5
    ' +# '

    text_6text_7text_8

    text_9
    ' +# '

    ...text_10

    ') +# assert (extract_text(html, guess_punct_space=False) == ( +# 'title text_1. text_2 text_3 text_4 text_5' +# ' text_6 text_7 text_8 text_9 ...text_10')) +# assert (extract_text( +# html, guess_punct_space=False, guess_page_layout=True) == ( +# 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' +# '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) +# assert (extract_text( +# html, +# guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' +# ' text_6 text_7 text_8 text_9...text_10')) +# assert (extract_text( +# html, guess_punct_space=True, guess_page_layout=True) == ( +# 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' +# '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 9dafbf026c8f0e1f2a3b91d88f9dd10c1c5d6a1f Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:22:30 -0700 Subject: [PATCH 20/40] remove cleaning of inline tags --- html_text/html_text.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 6dd7e36..c1dc717 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -15,12 +15,6 @@ 'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'p', 'pre', 'title', 'ul' ]) -INLINE_TEXT_TAGS = frozenset([ - 'abbr', 'acronym', 'b', 'bdi', 'bdo', 'cite', 'code', 'data', 'del', 'dfn', - 'em', 'i', 'ins', 'kbd', 'mark', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 's', - 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'tt', 'u', 'var', - 'wbr', -]) _clean_html = Cleaner( scripts=True, @@ -37,7 +31,6 @@ annoying_tags=False, remove_unknown_tags=False, safe_attrs_only=False, - remove_tags=INLINE_TEXT_TAGS, # helps newline placement if guess_page_layout=True ).clean_html From b3229d6add78590b0c25a44663ce04c582c8f5ba Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:41:16 -0700 Subject: [PATCH 21/40] fix bug with multiple newlines --- html_text/html_text.py | 22 ++++++++---------- tests/test_html_text.py | 51 ++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 37 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index c1dc717..9530f9b 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -79,25 +79,23 @@ def add_space(text, prev): def add_newline(tag, prev): if prev is None or prev == '\n\n': - return '' + return '', '\n\n' if tag in double_newline_tags: if prev == '\n': - return '\n' - return '\n\n' + return '\n', '\n\n' + return '\n\n', '\n\n' if tag in newline_tags: if prev == '\n': - return '' - return '\n' - return '' + return '', '\n' + return '\n', '\n' + return '', '' def traverse_text_fragments(tree, prev, depth): space = ' ' newline = '' text = '' if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline + newline, prev[0] = add_newline(tree.tag, prev[0]) if tree.text: text = _whitespace.sub(' ', tree.text.strip()) if text and guess_punct_space and not add_space(text, prev[0]): @@ -116,10 +114,8 @@ def traverse_text_fragments(tree, prev, depth): yield t if guess_page_layout: - newline = add_newline(tree.tag, prev[0]) - if newline: - prev[0] = newline - + newline, prev[0] = add_newline(tree.tag, prev[0]) + tail = '' if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 995d673..584e1fc 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -61,31 +61,34 @@ def test_punct_whitespace_preserved(): def 
test_selector(all_options): - html = '
<div><div id="extract-me">text<div>more</div></div>and more text</div>
    ' + html = ( + u'textmoreand more text' + ) sel = cleaned_selector(html) assert selector_to_text(sel, **all_options) == 'text more and more text' - subsel = sel.xpath('//div[@id="extract-me"]')[0] + subsel = sel.xpath('//span[@id="extract-me"]')[0] assert selector_to_text(subsel, **all_options) == 'text more' -# -# def test_guess_page_layout(): -# html = (u' title
    text_1.

    text_2 text_3

    ' -# '

    • text_4
    • text_5
    ' -# '

    text_6text_7text_8

    text_9
    ' -# '

    ...text_10

    ') -# assert (extract_text(html, guess_punct_space=False) == ( -# 'title text_1. text_2 text_3 text_4 text_5' -# ' text_6 text_7 text_8 text_9 ...text_10')) -# assert (extract_text( -# html, guess_punct_space=False, guess_page_layout=True) == ( -# 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' -# '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) -# assert (extract_text( -# html, -# guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' -# ' text_6 text_7 text_8 text_9...text_10')) -# assert (extract_text( -# html, guess_punct_space=True, guess_page_layout=True) == ( -# 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' -# '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + + +def test_guess_page_layout(): + html = (u' title
    text_1.

    text_2 text_3

    ' + '

    • text_4
    • text_5
    ' + '

    text_6text_7text_8

    text_9
    ' + '

    ...text_10

    ') + assert (extract_text(html, guess_punct_space=False) == ( + 'title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9 ...text_10')) + assert (extract_text( + html, guess_punct_space=False, guess_page_layout=True) == ( + 'title\n\n text_1.\n\n text_2 text_3\n\n text_4\n text_5' + '\n\n text_6 text_7 text_8\n\n text_9\n\n ...text_10')) + assert (extract_text( + html, + guess_punct_space=True) == ('title text_1. text_2 text_3 text_4 text_5' + ' text_6 text_7 text_8 text_9...text_10')) + assert (extract_text( + html, guess_punct_space=True, guess_page_layout=True) == ( + 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' + '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) From 695b458fa185929fc99611ffedeaeee648121821 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:42:11 -0700 Subject: [PATCH 22/40] remove newline --- tests/test_html_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 584e1fc..4a9ef42 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -70,7 +70,6 @@ def test_selector(all_options): assert selector_to_text(subsel, **all_options) == 'text more' - def test_guess_page_layout(): html = (u' title
    text_1.

    text_2 text_3

    ' '

    • text_4
    • text_5
    ' From 03259b9f398898ddf92fd2ff0e297ff3a93349ce Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 17:47:50 -0700 Subject: [PATCH 23/40] add test html without text --- tests/test_html_text.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 4a9ef42..8e93ac5 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -21,6 +21,13 @@ def all_options(request): return request.param +def test_extract_no_text_html(all_options): + html = (u'
<!DOCTYPE html><html><body><p><video width="320" height="240" '
+            'controls><source src="movie.mp4" type="video/mp4"><source '
+            'src="movie.ogg" type="video/ogg"></video></p></body></html>')
+    assert extract_text(html, **all_options) == u''
+
+
 def test_extract_text(all_options):
     html = u'

    Hello, world!' assert extract_text(html, **all_options) == u'Hello, world!' From cba531fe2330c9c14d82ef9ba8a717876c588759 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:26:58 -0700 Subject: [PATCH 24/40] fix newline + space bug --- html_text/html_text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9530f9b..9c9a20a 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -86,9 +86,9 @@ def add_newline(tag, prev): return '\n\n', '\n\n' if tag in newline_tags: if prev == '\n': - return '', '\n' + return '', prev return '\n', '\n' - return '', '' + return '', prev def traverse_text_fragments(tree, prev, depth): space = ' ' From 9811349bd06cf52e3b573a224e83bc60485eeaa2 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:27:35 -0700 Subject: [PATCH 25/40] add bad punct test --- tests/test_html_text.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 8e93ac5..83845e4 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -58,6 +58,7 @@ def test_inline_tags_whitespace(all_options): def test_punct_whitespace(): html = u'
<div><b>field</b>, and more</div>'
     assert extract_text(html, guess_punct_space=False) == u'field , and more'
+    assert extract_text(html, guess_punct_space=True) == u'field, and more'
 
 
 def test_punct_whitespace_preserved():
@@ -67,6 +68,19 @@ def test_punct_whitespace_preserved():
         html, guess_punct_space=True) == u'по ле, and , more ! now a (boo)')
 
 
+def test_bad_punct_whitespace():
+    html = (u'
    trees '
    +            '= webstruct'
    +            '.load_trees'
    +            '("train/*.html"'
    +            ')
    ') + assert extract_text( + html, guess_punct_space=False) == ( + u'trees = webstruct . load_trees ( "train/*.html" )') + assert extract_text( + html, guess_punct_space=True) == ( + u'trees = webstruct. load_trees ("train/*.html")') + def test_selector(all_options): html = ( u'textmoreand more text' From d47138cb09013e806df27179ca8e469a8c350501 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 18:27:53 -0700 Subject: [PATCH 26/40] add newline --- tests/test_html_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 83845e4..f98c58d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -81,6 +81,7 @@ def test_bad_punct_whitespace(): html, guess_punct_space=True) == ( u'trees = webstruct. load_trees ("train/*.html")') + def test_selector(all_options): html = ( u'textmoreand more text' From 76f9028c5e1c0b1e22561bce5b383e3f906d83be Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Mon, 10 Sep 2018 19:06:51 -0700 Subject: [PATCH 27/40] add tests on real webpages --- tests/test_html_text.py | 14 + ...the Attic | Books to Scrape - Sandbox.html | 361 +++++++++++ ... the Attic | Books to Scrape - Sandbox.txt | 30 + ...00\224 IANA-managed Reserved Domains.html" | 233 +++++++ ...200\224 IANA-managed Reserved Domains.txt" | 105 ++++ .../Scrapinghub Enterprise Solutions.html | 3 + .../Scrapinghub Enterprise Solutions.txt | 230 +++++++ ...\200\224 Webstruct 0.6 documentation.html" | 590 ++++++++++++++++++ ...2\200\224 Webstruct 0.6 documentation.txt" | 214 +++++++ ...\200\224 Webstruct 0.6 documentation.html" | 357 +++++++++++ ...2\200\224 Webstruct 0.6 documentation.txt" | 91 +++ 11 files changed, 2228 insertions(+) create mode 100644 tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html create mode 100644 tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt create mode 100644 "tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" create mode 100644 "tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" create mode 100644 tests/test_webpages/Scrapinghub Enterprise Solutions.html create mode 100644 tests/test_webpages/Scrapinghub Enterprise Solutions.txt create mode 100644 "tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" create mode 100644 "tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" create mode 100644 "tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" create mode 100644 "tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" diff --git a/tests/test_html_text.py b/tests/test_html_text.py index f98c58d..612668d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- import pytest import lxml +import glob +from pathlib import Path from html_text import (extract_text, parse_html, cleaned_selector, selector_to_text) @@ -113,3 +115,15 @@ def test_guess_page_layout(): html, guess_punct_space=True, guess_page_layout=True) == ( 'title\n\ntext_1.\n\ntext_2 text_3\n\ntext_4\ntext_5' '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) + + +def test_webpages(): + webpages = sorted(glob.glob('./test_webpages/*.html')) + extracted = sorted(glob.glob('./test_webpages/*.txt')) + for page, extr in zip(webpages, extracted): + with open(page, 'r', encoding='utf8') as f_in: + html = f_in.read() + with open(extr, 'r', encoding='utf8') as f_in: + expected = f_in.read() + assert 
(extract_text( + html, guess_punct_space=True, guess_page_layout=True) == expected) diff --git a/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html new file mode 100644 index 0000000..2c2d627 --- /dev/null +++ b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.html @@ -0,0 +1,361 @@ + + + + + + + + + + A Light in the Attic | Books to Scrape - Sandbox + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +
    +
    +
    Books to Scrape We love being scraped! +
    + + +
    +
    +
    + + + +
    +
    + + + + + + + + + +
    + +
    + + +
    + + + +
    + +
    + + +
    + +
    + +
    + + +
    + + + + + + + + + + + + + + + +
    + + + +
    + + +

    A Light in the Attic

    + + + + + + + + + + +

    £51.77

    + + +

    + + + In stock (22 available) + +

    + + + + + + + + +

    + + + + + + +   + + +

    + + + +
    + + + + + + + + + + + + + +
    + + +
    + + + +
    +

    Product Description

    +
    +

    It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more

    + + + + +
    +

    Product Information

    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    UPCa897fe39b1053632
    Product TypeBooks
    Price (excl. tax)£51.77
    Price (incl. tax)£51.77
    Tax£0.00
    AvailabilityIn stock (22 available)
    Number of reviews0
    + + + + +
    +
    +
    +
    + + + + + + + + + + + + + + + +
    +
    +
    +
    +
    + + + +
    + + + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt new file mode 100644 index 0000000..4c664a8 --- /dev/null +++ b/tests/test_webpages/A Light in the Attic | Books to Scrape - Sandbox.txt @@ -0,0 +1,30 @@ +A Light in the Attic | Books to Scrape - Sandbox + +Books to Scrape We love being scraped! + +Home +Books +Poetry +A Light in the Attic + +A Light in the Attic + +£51.77 + +In stock (22 available) + +Warning! This is a demo website for web scraping purposes. Prices and ratings here were randomly assigned and have no real meaning. + +Product Description + +It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounded so good. ...more + +Product Information + +UPC a897fe39b1053632 +Product Type Books +Price (excl. tax) £51.77 +Price (incl. tax) £51.77 +Tax £0.00 +Availability In stock (22 available) +Number of reviews 0 \ No newline at end of file diff --git "a/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" new file mode 100644 index 0000000..ccf988f --- /dev/null +++ "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.html" @@ -0,0 +1,233 @@ + + + + IANA — IANA-managed Reserved Domains + + + + + + + + + + + + + + + + + + +
    + +
    + +
    + + +
    + + +

    IANA-managed Reserved Domains

    + +

    Certain domains are set aside, and nominally registered to “IANA”, for specific + policy or technical purposes.

    + +

    Example domains

    + +

    As described in RFC 2606 and RFC 6761, + a number of domains such as example.com and example.org + are maintained for documentation purposes. These domains may be used as illustrative + examples in documents without prior coordination with us. They are + not available for registration or transfer.

    + +

    Test IDN top-level domains

    + +

    These domains were temporarily delegated by IANA for the IDN Evaluation being conducted by ICANN.

    + +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    DomainDomain (A-label)LanguageScript
    إختبارXN--KGBECHTVArabicArabic
    آزمایشیXN--HGBK6AJ7F53BBAPersianArabic
    测试XN--0ZWM56DChineseHan (Simplified variant)
    測試XN--G6W251DChineseHan (Traditional variant)
    испытаниеXN--80AKHBYKNJ4FRussianCyrillic
    परीक्षाXN--11B5BS3A9AJ6GHindiDevanagari (Nagari)
    δοκιμήXN--JXALPDLPGreek, Modern (1453-)Greek
    테스트XN--9T4B11YI5AKoreanHangul (Hangŭl, Hangeul)
    טעסטXN--DEBA0ADYiddishHebrew
    テストXN--ZCKZAHJapaneseKatakana
    பரிட்சைXN--HLCJ6AYA9ESC7ATamilTamil
    +
    + +

    Policy-reserved domains

    + +

    We act as both the registrant and registrar for a select number of domains + which have been reserved under policy grounds. These exclusions are + typically indicated in either technical standards (RFC documents), + or contractual limitations.

    + +

    Domains which are described as registered to IANA or ICANN on policy + grounds are not available for registration or transfer, with the exception + of country-name.info domains. These domains are available for release + by the ICANN Governmental Advisory Committee Secretariat.

    + +

    Other Special-Use Domains

    + +

    There is additionally a Special-Use Domain Names registry documenting special-use domains designated by technical standards. For further information, see Special-Use Domain Names (RFC 6761).

    + + +
    + + + + +
    + + + + + + + + + + diff --git "a/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" new file mode 100644 index 0000000..1aab856 --- /dev/null +++ "b/tests/test_webpages/IANA \342\200\224 IANA-managed Reserved Domains.txt" @@ -0,0 +1,105 @@ +IANA — IANA-managed Reserved Domains + +Domains +Numbers +Protocols +About Us + +IANA-managed Reserved Domains + +Certain domains are set aside, and nominally registered to “IANA”, for specific policy or technical purposes. + +Example domains + +As described in RFC 2606 and RFC 6761, a number of domains such as example.com and example.org are maintained for documentation purposes. These domains may be used as illustrative examples in documents without prior coordination with us. They are not available for registration or transfer. + +Test IDN top-level domains + +These domains were temporarily delegated by IANA for the IDN Evaluation being conducted by ICANN. + +Domain Domain (A-label) Language Script +إختبار XN--KGBECHTV Arabic Arabic +آزمایشی XN--HGBK6AJ7F53BBA Persian Arabic +测试 XN--0ZWM56D Chinese Han (Simplified variant) +測試 XN--G6W251D Chinese Han (Traditional variant) +испытание XN--80AKHBYKNJ4F Russian Cyrillic +परीक्षा XN--11B5BS3A9AJ6G Hindi Devanagari (Nagari) +δοκιμή XN--JXALPDLP Greek, Modern (1453-) Greek +테스트 XN--9T4B11YI5A Korean Hangul (Hangŭl, Hangeul) +טעסט XN--DEBA0AD Yiddish Hebrew +テスト XN--ZCKZAH Japanese Katakana +பரிட்சை XN--HLCJ6AYA9ESC7A Tamil Tamil + +Policy-reserved domains + +We act as both the registrant and registrar for a select number of domains which have been reserved under policy grounds. These exclusions are typically indicated in either technical standards (RFC documents), or contractual limitations. + +Domains which are described as registered to IANA or ICANN on policy grounds are not available for registration or transfer, with the exception of country-name.info domains. These domains are available for release by the ICANN Governmental Advisory Committee Secretariat. + +Other Special-Use Domains + +There is additionally a Special-Use Domain Names registry documenting special-use domains designated by technical standards. For further information, see Special-Use Domain Names (RFC 6761). + +Domain Names + +Overview +Root Zone Management + +Overview +Root Database +Hint and Zone Files +Change Requests +Instructions & Guides +Root Servers + +.INT Registry + +Overview +Register/modify an .INT domain +Eligibility + +.ARPA Registry +IDN Practices Repository + +Overview +Submit a table + +Root Key Signing Key (DNSSEC) + +Overview +Trusts Anchors and Keys +Root KSK Ceremonies +Practice Statement +Community Representatives + +Reserved Domains + +Domain Names + +Root Zone Registry +.INT Registry +.ARPA Registry +IDN Repository + +Number Resources + +Abuse Information + +Protocols + +Protocol Registries +Time Zone Database + +About Us + +Presentations +Reports +Performance +Reviews +Excellence +Contact Us + +The IANA functions coordinate the Internet’s globally unique identifiers, and are provided by Public Technical Identifiers, an affiliate of ICANN. + +Privacy Policy +Terms of Service \ No newline at end of file diff --git a/tests/test_webpages/Scrapinghub Enterprise Solutions.html b/tests/test_webpages/Scrapinghub Enterprise Solutions.html new file mode 100644 index 0000000..c9b02f2 --- /dev/null +++ b/tests/test_webpages/Scrapinghub Enterprise Solutions.html @@ -0,0 +1,3 @@ + Scrapinghub Enterprise Solutions

    Web data, hassle-free, for real business needs

    Get a free consultation

    From the world leading experts in web scraping

    Lead generation, competitor & sales intelligence

    Alternative data for finance, equity and market research

    Dark web, law enforcement & compliance

    Staffing, talent sourcing & job market research

    Product aggregation & price monitoring for retail, e-commerce & manufacturers

    Monitoring of ratings and reviews, sentiment analysis & social network intelligence

    Need some advice?

    The best web crawler team

    Authors of the #1 web crawling framework, the world’s most experienced team of engineers will help you get the very best results for your project.

    7 billion pages crawled on our platform
    per month

    We are the authors of the most popular open-source web scraping tools. You can be assured that our services are the best in class.

    100% money-back guarantee

    All your scraping projects are backed by us. Maintenance agreements and enterprise SLAs available to ensure long-term success.

    We scrape the web for:

    \ No newline at end of file diff --git a/tests/test_webpages/Scrapinghub Enterprise Solutions.txt b/tests/test_webpages/Scrapinghub Enterprise Solutions.txt new file mode 100644 index 0000000..d101b43 --- /dev/null +++ b/tests/test_webpages/Scrapinghub Enterprise Solutions.txt @@ -0,0 +1,230 @@ +Scrapinghub Enterprise Solutions + +Enterprise Solutions +Products + +Data on Demand + +Turn web content into useful data for your business + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Professional Services + +The most experienced team from the market leaders in web scraping + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Scrapy Training + +Get your team trained on Scrapy, by the team that create Scrapy itself + +Developer Tools + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Splash + +A full blown browser behind an API, to render pages and execute actions + +Pricing + +Data on Demand + +Crawlera + +Scrapy Cloud + +Splash + +Sign In + +hamburger + +Enterprise Solutions +Products + +Data on Demand + +Turn web content into useful data for your business + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Professional Services + +The most experienced team from the market leaders in web scraping + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Scrapy Training + +Get your team trained on Scrapy, by the team that create Scrapy itself + +Developer Tools + +Scrapy Cloud + +Deploy and manage your Scrapy spiders with your web scraping team + +Crawlera Smart Proxy + +A smart proxy that never gets banned and doesn't need IP rotation + +Splash + +A full blown browser behind an API, to render pages and execute actions + +Pricing + +Data on Demand + +Crawlera + +Scrapy Cloud + +Splash + +Sign In + +Enterprise Solutions + +Complete web scraping services for any size business, from startups to Fortune 100’s + +Tell us about your project + +Web data, hassle-free, for real business needs + +Get a free consultation + +From the world leading experts in web scraping + +Lead generation, competitor & sales intelligence + +Alternative data for finance, equity and market research + +Dark web, law enforcement & compliance + +Staffing, talent sourcing & job market research + +Product aggregation & price monitoring for retail, e-commerce & manufacturers + +Monitoring of ratings and reviews, sentiment analysis & social network intelligence + +Let’s Partner + +Team up with the best web scraping engineers while you stay focused on your business goals + +Quality assurance, enterprise service-level agreements and maintenance plans + +Full access to your project’s code with training and handover + +Money-back guarantee for your project + +Get in touch + +Data on Demand + +Any size scraping project. 
Data refreshed regularly, reliably and in the form you want + +Accuracy and coverage guarantees + +Scraped data from virtually any number of web pages + +Post processing and automated data crawling updates anytime + +Get in touch + +Data Science + +Enriched data for your business that goes beyond traditional web crawling needs + +Your raw web data post-processed for real insights + +Link data across disparate scraped pages + +Deduce sentiment on a large scale + +Get in touch + +Training + +Learn from the recognised experts in data crawling and scraping to grow your own in-house team + +One-to-one and group training + +Standard introduction to web scraping + +Tailored courses to help you solve very specific business challenges + +Get in touch + +Need some advice? + +The best web crawler team + +Authors of the #1 web crawling framework, the world’s most experienced team of engineers will help you get the very best results for your project. + +7 billion pages crawled on our platform +per month + +We are the authors of the most popular open-source web scraping tools. You can be assured that our services are the best in class. + +100% money-back guarantee + +All your scraping projects are backed by us. Maintenance agreements and enterprise SLAs available to ensure long-term success. + +Ask any question + +We scrape the web for: + +Need web data? + +Contact us + +scrapinghub-letter-logo + +Cuil Greine House + +Ballincollig Commercial Park, Link Road + +Ballincollig, Co. Cork, Ireland + +VAT Number IE 9787078K + +Follow us + +Company + +About us Clients Open Source Contact Jobs Press + +Products + +Data on Demand Proxy Network Professional Services Scrapy Training + +Developers + +Scrapy Cloud Crawlera Splash + +Resources + +Webinars Blog Documentation Support & KB Status Terms of Service Abuse Report Privacy Policy Cookie Policy + +© 2010-2017 Scrapinghub + +Scrapinghub uses cookies to enhance your experience, analyze our website traffic, and share information with our analytics partners. By using this website you consent to our use of cookies. For more information, please refer to our Cookie Policy. + +I Agree \ No newline at end of file diff --git "a/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" new file mode 100644 index 0000000..a174bca --- /dev/null +++ "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.html" @@ -0,0 +1,590 @@ + + + + + + + + + + + Tutorial — Webstruct 0.6 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    +
    +
    + +
    +

    Tutorial

    +

    This tutorial assumes you are familiar with machine learning.

    +
    +

    Get annotated data

    +

    First, you need the training/development data. We suggest to use +WebAnnotator Firefox extension to annotate HTML pages.

    +

    Recommended WebAnnotator options:

    +_images/wa-options.png +

    Pro tip - enable WebAnnotator toolbar buttons:

    +_images/wa-buttons.png +

    Follow WebAnnotator manual +to define named entities and annotate some web pages (nested WebAnnotator +entities are not supported). Use “Save as..” menu item or “Save as” +toolbar button to save the results; don’t use “Export as”.

    +

    After that you can load annotated webpages as lxml trees:

    +
    import webstruct
    +trees = webstruct.load_trees("train/*.html", webstruct.WebAnnotatorLoader())
    +
    +
    +

    See HTML Loaders for more info. +GATE annotation format is also supported.

    +
    +
    +

    From HTML to Tokens

    +

    To convert HTML trees to a format suitable for sequence prediction algorithm +(like CRF, MEMM or Structured Perceptron) the following approach is used:

    +
      +
    1. Text is extracted from HTML and split into tokens.
    2. +
    3. For each token a special HtmlToken instance is created. It +contains information not only about the text token itself, but also about +its position in HTML tree.
    4. +
    +

    A single HTML page corresponds to a single input sequence +(a list of HtmlTokens). For training/testing data +(where webpages are already annotated) there is also a list of labels for +each webpage, a label per HtmlToken.

    +

    To transform HTML trees into labels and HTML tokens +use HtmlTokenizer.

    +
    html_tokenizer = webstruct.HtmlTokenizer()
    +X, y = html_tokenizer.tokenize(trees)
    +
    +
    +

    Input trees should be loaded by one of the WebStruct loaders. +For consistency, for each tree (even if it is loaded from raw unannotated html) +HtmlTokenizer extracts two arrays: a list of HtmlToken +instances and a list of tags encoded using IOB2 encoding +(also known as BIO encoding). So in our example X is a list of +lists of HtmlToken instances, and y is a list of lists +of strings.

    +
    +
    +

    Feature Extraction

    +

    For supervised machine learning algorithms to work we need to extract +features.

    +

    In WebStruct feature vectors are Python dicts +{"feature_name": "feature_value"}; a dict is computed for +each HTML token. How to convert these dicts into representation required +by a sequence labelling toolkit depends on a toolkit used; we will cover +that later.

    +

    To compute feature dicts we’ll use HtmlFeatureExtractor.

    +

    First, define your feature functions. A feature function should take +an HtmlToken instance and return a feature dict; +feature dicts from individual feature functions will be merged +into the final feature dict for a token. Feature functions can ask questions +about token itself, its neighbours (in the same HTML element), +its position in HTML.

    +
    +

    Note

    +

    WebStruct supports other kind of feature functions that work on multiple +tokens; we don’t cover them in this tutorial.

    +
    +

    There are predefined feature functions in webstruct.features, +but for this tutorial let’s create some functions ourselves:

    +
    def token_identity(html_token):
    +    return {'token': html_token.token}
    +
    +def token_isupper(html_token):
    +    return {'isupper': html_token.token.isupper()}
    +
    +def parent_tag(html_token):
    +    return {'parent_tag': html_token.parent.tag}
    +
    +def border_at_left(html_token):
    +    return {'border_at_left': html_token.index == 0}
    +
    +
    +

    Next, create HtmlFeatureExtractor:

    +
    feature_extractor = HtmlFeatureExtractor(
    +    token_features = [
    +        token_identity,
    +        token_isupper,
    +        parent_tag,
    +        border_at_left
    +    ]
    +)
    +
    +
    +

    and use it to extract feature dicts:

    +
    features = feature_extractor.fit_transform(X)
    +
    +
    +

    See Feature Extraction for more info about HTML tokenization and +feature extraction.

    +
    +
    +

    Using a Sequence Labelling Toolkit

    +

    WebStruct doesn’t provide a CRF or Structured Perceptron implementation; +learning and prediction is supposed to be handled by an external +sequence labelling toolkit like CRFSuite, Wapiti or seqlearn.

    +

    Once feature dicts are extracted from HTML you should convert them to +a format required by your sequence labelling tooklit and use this toolkit +to train a model and do the prediction. For example, you may use +DictVectorizer from scikit-learn to convert feature dicts +into seqlearn input format.

    +

    We’ll use CRFSuite in this tutorial.

    +

    WebStruct provides some helpers for CRFSuite sequence labelling toolkit. +To use CRFSuite with WebStruct, you need

    +
      +
    • sklearn-crfsuite package (which depends on python-crfsuite and sklearn)
    • +
    +
    +

    Defining a Model

    +

    Basic way to define CRF model is the following:

    +
    model = webstruct.create_crfsuite_pipeline(
    +        token_features=[token_identity, token_isupper, parent_tag, border_at_left],
    +        verbose=True
    +    )
    +
    +
    +

    First create_crfsuite_pipeline() argument is a list of feature functions which will be used for training. +verbose is a boolean parameter enabling verbose output of various training information; +check sklearn-crfsuite API reference +for available options.

    +

    Under the hood create_crfsuite_pipeline() creates a +sklearn.pipeline.Pipeline with an HtmlFeatureExtractor instance +followed by sklearn_crfsuite.CRF instance. The example above is just a shortcut +for the following:

    +
    model = Pipeline([
    +    ('fe', HtmlFeatureExtractor(
    +        token_features = [
    +            token_identity,
    +            token_isupper,
    +            parent_tag,
    +            border_at_left,
    +        ]
    +    )),
    +    ('crf', sklearn_crfsuite.CRF(
    +        verbose=True
    +    )),
    +])
    +
    +
    +
    +
    +

    Training

    +

    To train a model use its fit method:

    +
    model.fit(X, y)
    +
    +
    +

    X and y are return values of HtmlTokenizer.tokenize() +(a list of lists of HtmlToken instances and a list of +lists of string IOB labels).

    +

    If you use sklearn_crfsuite.CRF directly then train it using +CRF.fit() method. It accepts 2 lists: a list of lists of +feature dicts, and a list of lists of tags:

    +
    model.fit(features, y)
    +
    +
    +
    +
    +
    +

    Named Entity Recognition

    +

    Once you got a trained model you can use it to extract entities +from unseen (unannotated) webpages. First, get some binary HTML data:

    +
    >>> import urllib2
    +>>> html = urllib2.urlopen("http://scrapinghub.com/contact").read()
    +
    +
    +

    Then create a NER instance initialized with a trained model:

    +
    >>> ner = webstruct.NER(model)
    +
    +
    +

    The model must provide a predict method that extracts features +from HTML tokens and predicts labels for these tokens. A pipeline created with +create_crfsuite_pipeline() function fits this definition.

    +

    Finally, use NER.extract() method to extract entities:

    +
    >>> ner.extract(html)
    +[('Scrapinghub', 'ORG'), ..., ('Iturriaga 3429 ap. 1', 'STREET'), ...]
    +
    +
    +

    Generally, the steps are:

    +
      +
    1. Load data using HtmlLoader loader. If a custom HTML cleaner +was used for loading training data make sure to apply it here as well.
    2. +
    3. Use the same html_tokenizer as used for training to extract HTML tokens +from loaded trees. All labels would be “O” when using HtmlLoader +loader - y can be discarded.
    4. +
    5. Use the same feature_extractor as used for training to extract +features.
    6. +
    7. Run your_crf.predict() method (e.g. CRF.predict()) +on features extracted in (3) to get the prediction - a list of IOB2-encoded +tags for each input document.
    8. +
    9. Build entities from input tokens based on predicted tags +(check IobEncoder.group() and smart_join()).
    10. +
    11. Split entities into groups (optional). One way to do it is to use +webstruct.grouping.
    12. +
    +

    NER helper class combines HTML loading, HTML tokenization, +feature extraction, CRF model, entity building and grouping.

    +
    +
    +

    Entity Grouping

    +

    Detecting entities on their own is not always enough; in many cases +what is wanted is to find the relationship between them. For example, +“street_name/STREET city_name/CITY zipcode_number/ZIPCODE +form an address”, or “phone/TEL is a phone of person/PER”.

    +

    The first approximation is to say that all entities from a single webpage +are related. For example, if we have extracted some organizaion/ORG and some +phone/TEL from a single webpage we may assume that the phone +is a contact phone of the organization.

    +

    Sometimes there are several “entity groups” on a webpage. If a page +contains contact phones of several persons or several business locations +it is better to split all entities into groups of related +entities - “person name + his/her phone(s)” or “address”.

    +

    WebStruct provides an unsupervised algorithm +for extracting such entity groups. Algorithm prefers to build +large groups without entities of duplicate types; if a split is needed +algorithm tries to split at points where distance between entities is larger.

    +

    Use NER.extract_groups() to extract groups of entities:

    +
    >>> ner.extract_groups(html)
    +[[...], ... [('Iturriaga 3429 ap. 1', 'STREET'), ('Montevideo', 'CITY'), ...]]
    +
    +
    +

    Sometimes it is better to allow some entity types to appear +multuple times in a group. For example, a person (PER entity) may have +several contact phones and faxes (TEL and FAX entities) - we should penalize +groups with multiple PERs, but multiple TELs and FAXes are fine. +Use dont_penalize argument if you want to allow some entity types +to appear multiple times in a group:

    +
    ner.extract_groups(html, dont_penalize={'TEL', 'FAX'})
    +
    +
    +

    The simple algorithm WebStruct provides is by no means a general solution +to relation detection, but give it a try - maybe it is enough for your task.

    +
    +
    +

    Model Development

    +

    To develop the model you need to choose the learning algorithm, +features, hyperparameters, etc. To do that you need scoring metrics, +cross-validation utilities and tools for debugging what classifier learned. +WebStruct helps in the following way:

    +
      +
    1. Pipeline created by create_crfsuite_pipeline() is compatible with +cross-validation and grid search utilities from scikit-learn; +use them to select model parameters and check the quality.

      +

      One limitation of create_crfsuite_pipeline() is that n_jobs +in scikit-learn functions and classes should be 1, but other than that +WebStruct objects should work fine with scikit-learn. Just keep in mind +that for WebStruct an “observation” is a document, not an individual token, +and a “label” is a sequence of labels for a document, not an individual +IOB tag.

      +
    2. +
    3. There is webstruct.metrics module with a couple of metrics useful +for sequence classification.

      +
    4. +
    +

    To debug what CRFSuite learned you could use eli5 library. With eli5 it would be two calls to +eli5.explain_weights() and eli5.format_as_html() with sklearn_crfsuite.CRF instance as argument. +As a result you will get transitions and feature weights.

    +
    +
    + + +
    + +
    + + +
    +
    + +
    + +
    + + +
    + + Read the Docs + v: latest + + +
    +
    +
    Versions
    + +
    latest
    + +
    stable
    + +
    0.6
    + +
    0.5
    + +
    0.4.1
    + +
    0.4
    + +
    0.3
    + +
    0.2
    + +
    +
    +
    Downloads
    + +
    pdf
    + +
    htmlzip
    + +
    epub
    + +
    +
    +
    On Read the Docs
    +
    + Project Home +
    +
    + Builds +
    +
    +
    + Free document hosting provided by Read the Docs. + +
    +
    + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" new file mode 100644 index 0000000..0431385 --- /dev/null +++ "b/tests/test_webpages/Tutorial \342\200\224 Webstruct 0.6 documentation.txt" @@ -0,0 +1,214 @@ +Tutorial — Webstruct 0.6 documentation + +Webstruct +latest + +Webstruct +Tutorial + +Get annotated data +From HTML to Tokens +Feature Extraction +Using a Sequence Labelling Toolkit + +Defining a Model +Training + +Named Entity Recognition +Entity Grouping +Model Development + +Reference +Changes + +Webstruct + +Docs » +Tutorial +Edit on GitHub + +Tutorial ¶ + +This tutorial assumes you are familiar with machine learning. + +Get annotated data ¶ + +First, you need the training/development data. We suggest to use WebAnnotator Firefox extension to annotate HTML pages. + +Recommended WebAnnotator options: + +Pro tip - enable WebAnnotator toolbar buttons: + +Follow WebAnnotator manual to define named entities and annotate some web pages (nested WebAnnotator entities are not supported). Use “Save as..” menu item or “Save as” toolbar button to save the results; don’t use “Export as”. + +After that you can load annotated webpages as lxml trees: + +import webstruct trees = webstruct. load_trees ("train/*.html", webstruct. WebAnnotatorLoader ()) + +See HTML Loaders for more info. GATE annotation format is also supported. + +From HTML to Tokens ¶ + +To convert HTML trees to a format suitable for sequence prediction algorithm (like CRF, MEMM or Structured Perceptron) the following approach is used: + +Text is extracted from HTML and split into tokens. +For each token a special HtmlToken instance is created. It contains information not only about the text token itself, but also about its position in HTML tree. + +A single HTML page corresponds to a single input sequence (a list of HtmlTokens). For training/testing data (where webpages are already annotated) there is also a list of labels for each webpage, a label per HtmlToken. + +To transform HTML trees into labels and HTML tokens use HtmlTokenizer. + +html_tokenizer = webstruct. HtmlTokenizer () X, y = html_tokenizer. tokenize (trees) + +Input trees should be loaded by one of the WebStruct loaders. For consistency, for each tree (even if it is loaded from raw unannotated html) HtmlTokenizer extracts two arrays: a list of HtmlToken instances and a list of tags encoded using IOB2 encoding (also known as BIO encoding). So in our example X is a list of lists of HtmlToken instances, and y is a list of lists of strings. + +Feature Extraction ¶ + +For supervised machine learning algorithms to work we need to extract features. + +In WebStruct feature vectors are Python dicts {"feature_name":"feature_value"}; a dict is computed for each HTML token. How to convert these dicts into representation required by a sequence labelling toolkit depends on a toolkit used; we will cover that later. + +To compute feature dicts we’ll use HtmlFeatureExtractor. + +First, define your feature functions. A feature function should take an HtmlToken instance and return a feature dict; feature dicts from individual feature functions will be merged into the final feature dict for a token. Feature functions can ask questions about token itself, its neighbours (in the same HTML element), its position in HTML. 
+ +Note + +WebStruct supports other kind of feature functions that work on multiple tokens; we don’t cover them in this tutorial. + +There are predefined feature functions in webstruct.features, but for this tutorial let’s create some functions ourselves: + +def token_identity (html_token): return { 'token': html_token. token } def token_isupper (html_token): return { 'isupper': html_token. token. isupper ()} def parent_tag (html_token): return { 'parent_tag': html_token. parent. tag } def border_at_left (html_token): return { 'border_at_left': html_token. index == 0 } + +Next, create HtmlFeatureExtractor: + +feature_extractor = HtmlFeatureExtractor (token_features = [ token_identity, token_isupper, parent_tag, border_at_left ]) + +and use it to extract feature dicts: + +features = feature_extractor. fit_transform (X) + +See Feature Extraction for more info about HTML tokenization and feature extraction. + +Using a Sequence Labelling Toolkit ¶ + +WebStruct doesn’t provide a CRF or Structured Perceptron implementation; learning and prediction is supposed to be handled by an external sequence labelling toolkit like CRFSuite, Wapiti or seqlearn. + +Once feature dicts are extracted from HTML you should convert them to a format required by your sequence labelling tooklit and use this toolkit to train a model and do the prediction. For example, you may use DictVectorizer from scikit-learn to convert feature dicts into seqlearn input format. + +We’ll use CRFSuite in this tutorial. + +WebStruct provides some helpers for CRFSuite sequence labelling toolkit. To use CRFSuite with WebStruct, you need + +sklearn-crfsuite package (which depends on python-crfsuite and sklearn) + +Defining a Model ¶ + +Basic way to define CRF model is the following: + +model = webstruct. create_crfsuite_pipeline (token_features = [ token_identity, token_isupper, parent_tag, border_at_left ], verbose = True) + +First create_crfsuite_pipeline() argument is a list of feature functions which will be used for training. verbose is a boolean parameter enabling verbose output of various training information; check sklearn-crfsuite API reference for available options. + +Under the hood create_crfsuite_pipeline() creates a sklearn.pipeline.Pipeline with an HtmlFeatureExtractor instance followed by sklearn_crfsuite.CRF instance. The example above is just a shortcut for the following: + +model = Pipeline ([ ('fe', HtmlFeatureExtractor (token_features = [ token_identity, token_isupper, parent_tag, border_at_left, ])), ('crf', sklearn_crfsuite. CRF (verbose = True)), ]) + +Training ¶ + +To train a model use its fit method: + +model. fit (X, y) + +X and y are return values of HtmlTokenizer.tokenize() (a list of lists of HtmlToken instances and a list of lists of string IOB labels). + +If you use sklearn_crfsuite.CRF directly then train it using CRF.fit() method. It accepts 2 lists: a list of lists of feature dicts, and a list of lists of tags: + +model. fit (features, y) + +Named Entity Recognition ¶ + +Once you got a trained model you can use it to extract entities from unseen (unannotated) webpages. First, get some binary HTML data: + +>>> import urllib2 >>> html = urllib2. urlopen ("http://scrapinghub.com/contact"). read () + +Then create a NER instance initialized with a trained model: + +>>> ner = webstruct. NER (model) + +The model must provide a predict method that extracts features from HTML tokens and predicts labels for these tokens. A pipeline created with create_crfsuite_pipeline() function fits this definition. 
+ +Finally, use NER.extract() method to extract entities: + +>>> ner. extract (html) [('Scrapinghub', 'ORG'), ..., ('Iturriaga 3429 ap. 1', 'STREET'), ...] + +Generally, the steps are: + +Load data using HtmlLoader loader. If a custom HTML cleaner was used for loading training data make sure to apply it here as well. +Use the same html_tokenizer as used for training to extract HTML tokens from loaded trees. All labels would be “O” when using HtmlLoader loader - y can be discarded. +Use the same feature_extractor as used for training to extract features. +Run your_crf.predict() method (e.g. CRF.predict()) on features extracted in (3) to get the prediction - a list of IOB2-encoded tags for each input document. +Build entities from input tokens based on predicted tags (check IobEncoder.group() and smart_join()). +Split entities into groups (optional). One way to do it is to use webstruct.grouping. + +NER helper class combines HTML loading, HTML tokenization, feature extraction, CRF model, entity building and grouping. + +Entity Grouping ¶ + +Detecting entities on their own is not always enough; in many cases what is wanted is to find the relationship between them. For example, “ street_name/STREET city_name/CITY zipcode_number/ZIPCODE form an address”, or “ phone/TEL is a phone of person/PER ”. + +The first approximation is to say that all entities from a single webpage are related. For example, if we have extracted some organizaion/ORG and some phone/TEL from a single webpage we may assume that the phone is a contact phone of the organization. + +Sometimes there are several “entity groups” on a webpage. If a page contains contact phones of several persons or several business locations it is better to split all entities into groups of related entities - “person name + his/her phone(s)” or “address”. + +WebStruct provides an unsupervised algorithm for extracting such entity groups. Algorithm prefers to build large groups without entities of duplicate types; if a split is needed algorithm tries to split at points where distance between entities is larger. + +Use NER.extract_groups() to extract groups of entities: + +>>> ner. extract_groups (html) [[...], ... [('Iturriaga 3429 ap. 1', 'STREET'), ('Montevideo', 'CITY'), ...]] + +Sometimes it is better to allow some entity types to appear multuple times in a group. For example, a person (PER entity) may have several contact phones and faxes (TEL and FAX entities) - we should penalize groups with multiple PERs, but multiple TELs and FAXes are fine. Use dont_penalize argument if you want to allow some entity types to appear multiple times in a group: + +ner. extract_groups (html, dont_penalize = { 'TEL', 'FAX' }) + +The simple algorithm WebStruct provides is by no means a general solution to relation detection, but give it a try - maybe it is enough for your task. + +Model Development ¶ + +To develop the model you need to choose the learning algorithm, features, hyperparameters, etc. To do that you need scoring metrics, cross-validation utilities and tools for debugging what classifier learned. WebStruct helps in the following way: + +Pipeline created by create_crfsuite_pipeline() is compatible with cross-validation and grid search utilities from scikit-learn; use them to select model parameters and check the quality. + +One limitation of create_crfsuite_pipeline() is that n_jobs in scikit-learn functions and classes should be 1, but other than that WebStruct objects should work fine with scikit-learn. 
Just keep in mind that for WebStruct an “observation” is a document, not an individual token, and a “label” is a sequence of labels for a document, not an individual IOB tag. + +There is webstruct.metrics module with a couple of metrics useful for sequence classification. + +To debug what CRFSuite learned you could use eli5 library. With eli5 it would be two calls to eli5.explain_weights() and eli5.format_as_html() with sklearn_crfsuite.CRF instance as argument. As a result you will get transitions and feature weights. + +Next Previous + +© Copyright 2014-2017, Scrapinghub Inc.. Revision 9e461566. + +Built with Sphinx using a theme provided by Read the Docs. +Read the Docs v: latest + +Versions +latest +stable +0.6 +0.5 +0.4.1 +0.4 +0.3 +0.2 + +Downloads +pdf +htmlzip +epub + +On Read the Docs +Project Home +Builds + +Free document hosting provided by Read the Docs. \ No newline at end of file diff --git "a/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" new file mode 100644 index 0000000..1fbcb65 --- /dev/null +++ "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.html" @@ -0,0 +1,357 @@ + + + + + + + + + + + Webstruct — Webstruct 0.6 documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    + + + + +
    + + + + + +
    + +
    + + + + + + + + + + + + + + + + + +
    + + + + +
    +
    +
    +
    + +
    +

    Webstruct

    +

    Webstruct is a library for creating statistical NER systems that work +on HTML data, i.e. a library for building tools that extract named +entities (addresses, organization names, open hours, etc) from webpages.

    +

    Contents:

    + +
    +
    +

    Indices and tables

    + +
    + + +
    + +
    +
    + + + + +
    + +
    +

    + © Copyright 2014-2017, Scrapinghub Inc.. + + + Revision 9e461566. + + + +

    +
    + Built with Sphinx using a theme provided by Read the Docs. + +
    + +
    +
    + +
    + +
    + + +
    + + Read the Docs + v: latest + + +
    +
    +
    Versions
    + +
    latest
    + +
    stable
    + +
    0.6
    + +
    0.5
    + +
    0.4.1
    + +
    0.4
    + +
    0.3
    + +
    0.2
    + +
    +
    +
    Downloads
    + +
    pdf
    + +
    htmlzip
    + +
    epub
    + +
    +
    +
    On Read the Docs
    +
    + Project Home +
    +
    + Builds +
    +
    +
    + Free document hosting provided by Read the Docs. + +
    +
    + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git "a/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" new file mode 100644 index 0000000..6297ee9 --- /dev/null +++ "b/tests/test_webpages/Webstruct \342\200\224 Webstruct 0.6 documentation.txt" @@ -0,0 +1,91 @@ +Webstruct — Webstruct 0.6 documentation + +Webstruct +latest + +Webstruct +Tutorial +Reference +Changes + +Webstruct + +Docs » +Webstruct +Edit on GitHub + +Webstruct ¶ + +Webstruct is a library for creating statistical NER systems that work on HTML data, i.e. a library for building tools that extract named entities (addresses, organization names, open hours, etc) from webpages. + +Contents: + +Webstruct + +Overview +Installation + +Tutorial + +Get annotated data +From HTML to Tokens +Feature Extraction +Using a Sequence Labelling Toolkit +Named Entity Recognition +Entity Grouping +Model Development + +Reference + +HTML Loaders +Feature Extraction +Model Creation Helpers +Metrics +Entity Grouping +Wapiti Helpers +CRFsuite Helpers +WebAnnotator Utilities +BaseSequenceClassifier +Miscellaneous + +Changes + +0.6 (2017-12-29) +0.5 (2017-05-10) +0.4.1 (2016-11-28) +0.4 (2016-11-26) +0.3 (2016-09-19) + +Indices and tables ¶ + +Index +Module Index +Search Page + +Next + +© Copyright 2014-2017, Scrapinghub Inc.. Revision 9e461566. + +Built with Sphinx using a theme provided by Read the Docs. +Read the Docs v: latest + +Versions +latest +stable +0.6 +0.5 +0.4.1 +0.4 +0.3 +0.2 + +Downloads +pdf +htmlzip +epub + +On Read the Docs +Project Home +Builds + +Free document hosting provided by Read the Docs. \ No newline at end of file From 05c77023804bdad5c8c6476af86ede466ef49bb8 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:15:10 -0700 Subject: [PATCH 28/40] tests to hopefully make codecov happy --- tests/test_html_text.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 612668d..8b0c73d 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -31,7 +31,8 @@ def test_extract_no_text_html(all_options): def test_extract_text(all_options): - html = u'

    Hello, world!' + html = (u'' + '

    Hello, world!') assert extract_text(html, **all_options) == u'Hello, world!' @@ -47,7 +48,8 @@ def test_empty(all_options): def test_extract_text_from_tree(all_options): - html = u'

    Hello, world!' + html = (u'' + '

    Hello, world!') tree = parse_html(html) assert extract_text(tree, **all_options) == u'Hello, world!' @@ -85,9 +87,8 @@ def test_bad_punct_whitespace(): def test_selector(all_options): - html = ( - u'textmoreand more text' - ) + html = (u'textmore' + 'and more text') sel = cleaned_selector(html) assert selector_to_text(sel, **all_options) == 'text more and more text' subsel = sel.xpath('//span[@id="extract-me"]')[0] @@ -117,6 +118,12 @@ def test_guess_page_layout(): '\n\ntext_6 text_7 text_8\n\ntext_9\n\n...text_10')) +def test_adjust_newline(): + html = u'

    text 1

    text 2

    ' + assert (extract_text(html, guess_punct_space=True, + guess_page_layout=True) == ('text 1\n\ntext 2')) + + def test_webpages(): webpages = sorted(glob.glob('./test_webpages/*.html')) extracted = sorted(glob.glob('./test_webpages/*.txt')) From 4505e24c577d6e071761d82f508b1fca10ec7085 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:51:27 -0700 Subject: [PATCH 29/40] remove pathlib import --- tests/test_html_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index 8b0c73d..db78a1e 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -2,7 +2,6 @@ import pytest import lxml import glob -from pathlib import Path from html_text import (extract_text, parse_html, cleaned_selector, selector_to_text) From a27e4c8ba750c4b4df1e0c7f32fbcd2c7c3e8a29 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 12:52:31 -0700 Subject: [PATCH 30/40] fix test --- tests/test_html_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_html_text.py b/tests/test_html_text.py index db78a1e..86c32fe 100644 --- a/tests/test_html_text.py +++ b/tests/test_html_text.py @@ -118,7 +118,7 @@ def test_guess_page_layout(): def test_adjust_newline(): - html = u'
    text 1

    text 2

    ' + html = u'
    text 1

    text 2

    ' assert (extract_text(html, guess_punct_space=True, guess_page_layout=True) == ('text 1\n\ntext 2')) From b926c8cfd43f6813d29a17b80a554cac86d73da8 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Tue, 11 Sep 2018 18:00:21 -0700 Subject: [PATCH 31/40] remove space --- html_text/html_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index 9c9a20a..d391a77 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -115,7 +115,7 @@ def traverse_text_fragments(tree, prev, depth): if guess_page_layout: newline, prev[0] = add_newline(tree.tag, prev[0]) - + tail = '' if tree.tail and depth != 0: tail = _whitespace.sub(' ', tree.tail.strip()) From 73f49ad6fa6fc27cefad2b146859e84b8b6fa601 Mon Sep 17 00:00:00 2001 From: Ludovica Gonella Date: Wed, 19 Sep 2018 16:45:47 -0700 Subject: [PATCH 32/40] handle list of selectors --- html_text/html_text.py | 25 ++++++++++++++++++------- tests/test_html_text.py | 10 ++++++---- 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/html_text/html_text.py b/html_text/html_text.py index d391a77..314ccb5 100644 --- a/html_text/html_text.py +++ b/html_text/html_text.py @@ -139,10 +139,20 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): See html_text.extract_text docstring for description of the approach and options. """ - return _html_to_text( - sel.root, - guess_punct_space=guess_punct_space, - guess_page_layout=guess_page_layout) + if isinstance(sel, list): + # if selecting a specific xpath + text = [ + _html_to_text( + t.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) for t in sel + ] + return ' '.join(text) + else: + return _html_to_text( + sel.root, + guess_punct_space=guess_punct_space, + guess_page_layout=guess_page_layout) def cleaned_selector(html): @@ -173,9 +183,10 @@ def extract_text(html, for punctuation. This has a slight (around 10%) performance overhead and is just a heuristic. - When guess_page_layout is True (default is False), a newline is added after - NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic - makes the extracted text more similar to how it looks like in the browser. + When guess_page_layout is True (default is False), a newline is added + before and after NEWLINE_TAGS and two newlines are added before and after + DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar + to how it is rendered in the browser. NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized. 
From 73f49ad6fa6fc27cefad2b146859e84b8b6fa601 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 16:45:47 -0700
Subject: [PATCH 32/40] handle list of selectors

---
 html_text/html_text.py  | 25 ++++++++++++++++++-------
 tests/test_html_text.py | 10 ++++++----
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index d391a77..314ccb5 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -139,10 +139,20 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
     See html_text.extract_text docstring for description
     of the approach and options.
     """
-    return _html_to_text(
-        sel.root,
-        guess_punct_space=guess_punct_space,
-        guess_page_layout=guess_page_layout)
+    if isinstance(sel, list):
+        # if selecting a specific xpath
+        text = [
+            _html_to_text(
+                t.root,
+                guess_punct_space=guess_punct_space,
+                guess_page_layout=guess_page_layout) for t in sel
+        ]
+        return ' '.join(text)
+    else:
+        return _html_to_text(
+            sel.root,
+            guess_punct_space=guess_punct_space,
+            guess_page_layout=guess_page_layout)
 
 
 def cleaned_selector(html):
@@ -173,9 +183,10 @@ def extract_text(html,
     for punctuation. This has a slight (around 10%) performance overhead
     and is just a heuristic.
 
-    When guess_page_layout is True (default is False), a newline is added after
-    NEWLINE_TAGS and two newlines after DOUBLE_NEWLINE_TAGS. This heuristic
-    makes the extracted text more similar to how it looks like in the browser.
+    When guess_page_layout is True (default is False), a newline is added
+    before and after NEWLINE_TAGS and two newlines are added before and after
+    DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar
+    to how it is rendered in the browser.
 
     NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized.
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index 86c32fe..c461853 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -86,12 +86,14 @@ def test_bad_punct_whitespace():
 
 def test_selector(all_options):
-    html = (u'<span id="extract-me">text<a>more</a></span>'
-            'and more text')
+    html = (u'<span id="extract-me">text<a>more</a></span>'
+            'and more text <a> and some more </a>')
     sel = cleaned_selector(html)
-    assert selector_to_text(sel, **all_options) == 'text more and more text'
-    subsel = sel.xpath('//span[@id="extract-me"]')[0]
+    assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
+    subsel = sel.xpath('//span[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == 'text more'
+    subsel = sel.xpath('//a')
+    assert selector_to_text(subsel, **all_options) == 'more and some more'

From 15d22e050a4d1f5ec06d054a2fb6535c4b087d03 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 16:48:20 -0700
Subject: [PATCH 33/40] a list of selectors returns a list of texts

---
 html_text/html_text.py  | 2 +-
 tests/test_html_text.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 314ccb5..a298d4a 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -147,7 +147,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
                 guess_punct_space=guess_punct_space,
                 guess_page_layout=guess_page_layout) for t in sel
         ]
-        return ' '.join(text)
+        return text
     else:
         return _html_to_text(
             sel.root,
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index c461853..fbb892d 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -91,9 +91,9 @@ def test_selector(all_options):
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == 'text more'
+    assert selector_to_text(subsel, **all_options) == ['text more']
     subsel = sel.xpath('//a')
-    assert selector_to_text(subsel, **all_options) == 'more and some more'
+    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']

From 8f68b2c99768fd1f9ca7b02f7a72be2901d20741 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:02:38 -0700
Subject: [PATCH 34/40] selectors_to_text add to res only if something is extracted

---
 README.rst              | 29 ++++++++++++++++++++++-------
 html_text/html_text.py  | 10 ++++++----
 tests/test_html_text.py |  4 +++-
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/README.rst b/README.rst
index cac5424..b5020b5 100644
--- a/README.rst
+++ b/README.rst
@@ -26,9 +26,10 @@ or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
 It normalizes whitespace, but is also smarter than
 ``.xpath('normalize-space())``,
-adding spaces around inline elements too
-(which are often used as block elements in html markup),
-and tries to avoid adding extra spaces for punctuation.
+adding spaces around inline elements (which are often used as block
+elements in html markup), tries to avoid adding extra spaces for punctuation and
+can add newlines so that the output text looks like how it is rendered in
+browsers.
 Apart from just getting text from the page (e.g. for display or search),
 one intended usage of this library is for machine learning (feature extraction).
@@ -56,18 +57,32 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hey</h1>')
-    u'Hey'
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    u'Hello world!'
+
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    u'Hello
+    world!'
 
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
-    >>> tree = html_text.parse_html(u'<h1>Hey</h1>')
+    >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
     >>> text = html_text.extract_text(tree)
-    u'Hey'
+    u'Hello world!'
+
+Or define a selector to extract text only from specific elements, this will
+return a list of strings of text, one for each element:
+
+    >>> import html_text
+    >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
+    >>> subsel = sel.xpath('//h1')
+    >>> text = html_text.selector_to_text(subsel)
+    [u'Hello']
 
 Passed html will be first cleaned from invisible non-text content such
 as styles, and then text would be extracted.
+
 Two functions that do it are ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
diff --git a/html_text/html_text.py b/html_text/html_text.py
index a298d4a..5a3037d 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -141,12 +141,14 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
     """
     if isinstance(sel, list):
         # if selecting a specific xpath
-        text = [
-            _html_to_text(
+        text = []
+        for t in sel:
+            extracted = _html_to_text(
                 t.root,
                 guess_punct_space=guess_punct_space,
-                guess_page_layout=guess_page_layout) for t in sel
-        ]
+                guess_page_layout=guess_page_layout)
+            if extracted:
+                text.append(extracted)
         return text
     else:
         return _html_to_text(
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index fbb892d..b01329c 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -87,13 +87,15 @@ def test_bad_punct_whitespace():
 
 def test_selector(all_options):
     html = (u'<span id="extract-me">text<a>more</a></span>'
-            'and more text <a> and some more </a>')
+            'and more text <a> and some more </a> <a id="extract-me"></a>')
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == ['text more']
     subsel = sel.xpath('//a')
    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']
+    subsel = sel.xpath('//a[@id="extract-me"]')
+    assert selector_to_text(subsel, **all_options) == []

From cf02b940b8d5b32daa353fd73bfd485d14afd3e6 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:03:53 -0700
Subject: [PATCH 35/40] selectors_to_text merge results as in previous implementation

---
 html_text/html_text.py  | 2 +-
 tests/test_html_text.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index 5a3037d..cb64058 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -149,7 +149,7 @@ def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
                 guess_page_layout=guess_page_layout)
             if extracted:
                 text.append(extracted)
-        return text
+        return ' '.join(text)
     else:
         return _html_to_text(
             sel.root,
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index b01329c..eb8cacf 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -91,11 +91,11 @@ def test_selector(all_options):
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
     subsel = sel.xpath('//span[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == ['text more']
+    assert selector_to_text(subsel, **all_options) == 'text more'
     subsel = sel.xpath('//a')
-    assert selector_to_text(subsel, **all_options) == ['more', 'and some more']
+    assert selector_to_text(subsel, **all_options) == 'more and some more'
     subsel = sel.xpath('//a[@id="extract-me"]')
-    assert selector_to_text(subsel, **all_options) == []
+    assert selector_to_text(subsel, **all_options) == ''
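After patches 32-35, selector_to_text settles on the following contract: a
single parsel.Selector yields one string, while a SelectorList joins the
per-element extractions with a space, skipping elements that extract nothing.
A hedged sketch (the markup here is chosen for illustration only):

    from html_text import cleaned_selector, selector_to_text

    sel = cleaned_selector(u'<p>one</p><p>two</p><p></p>')

    selector_to_text(sel)                   # -> u'one two'
    selector_to_text(sel.xpath('//p'))      # -> u'one two' (empty <p> skipped)
    selector_to_text(sel.xpath('//table'))  # -> u'' (no matches at all)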
From 7aec8d2ae0c044996f8549d307a67c97ce71479e Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:11:50 -0700
Subject: [PATCH 36/40] update readme

---
 README.rst | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index b5020b5..77457cc 100644
--- a/README.rst
+++ b/README.rst
@@ -71,17 +71,18 @@ You can also pass already parsed ``lxml.html.HtmlElement``:
     >>> text = html_text.extract_text(tree)
     u'Hello world!'
 
-Or define a selector to extract text only from specific elements, this will
-return a list of strings of text, one for each element:
+Or define a selector to extract text only from specific elements:
 
     >>> import html_text
     >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
     >>> subsel = sel.xpath('//h1')
     >>> text = html_text.selector_to_text(subsel)
-    [u'Hello']
+    u'Hello'
 
 Passed html will be first cleaned from invisible non-text content such
 as styles, and then text would be extracted.
+NB: Selectors are not cleaned automatically; you need to call
+``html_text.cleaned_selector`` first.
 
 Two functions that do it are ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
@@ -90,6 +91,24 @@ Two functions that do it are ``html_text.cleaned_selector`` and
    and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
   extracted text.
+* ``html_text.extract_text`` accepts html and returns extracted text.
+
+If ``guess_page_layout`` is True (False by default for backward compatibility),
+a newline is added before and after NEWLINE_TAGS and two newlines are added
+before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
+more similar to how it is rendered in the browser.
+NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized; these are the lists of
+the tags that are handled by default:
+
+* NEWLINE_TAGS = frozenset([
+      'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
+      'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
+      'nav', 'table', 'tr'
+  ])
+* DOUBLE_NEWLINE_TAGS = frozenset([
+      'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
+      'p', 'pre', 'title', 'ul'
+  ])
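To make the default tag sets above concrete, a small hedged example; the
output is what the documented rules imply rather than something copied from
the patch:

    from html_text import extract_text

    # <br> is in NEWLINE_TAGS, <p> is in DOUBLE_NEWLINE_TAGS:
    extract_text(u'<p>one<br>two</p><p>three</p>', guess_page_layout=True)
    # -> u'one\ntwo\n\nthree'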
From 4300fe6f14659fdc8b03c335b4ff3ab9a3c02546 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:20:44 -0700
Subject: [PATCH 37/40] update history

---
 CHANGES.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index c6aa51f..8a31755 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -2,6 +2,14 @@
 History
 =======
 
+0.4.0 (TBD)
+------------------
+
+* Add ``guess_page_layout`` to make extracted text look like how it is
+  rendered in a browser.
+* Add tests of layout extraction for real webpages.
+
+
 0.3.0 (2017-10-12)
 ------------------

From 4300fe6f14659fdc8b03c335b4ff3ab9a3c02546 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Wed, 19 Sep 2018 17:24:32 -0700
Subject: [PATCH 38/40] update readme

---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index 77457cc..d947c80 100644
--- a/README.rst
+++ b/README.rst
@@ -84,14 +84,14 @@ as styles, and then text would be extracted.
 NB: Selectors are not cleaned automatically; you need to call
 ``html_text.cleaned_selector`` first.
 
-Two functions that do it are ``html_text.cleaned_selector`` and
+The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:
 
+* ``html_text.extract_text`` accepts html and returns extracted text.
 * ``html_text.cleaned_selector`` accepts html as text or as
   ``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
   extracted text.
-* ``html_text.extract_text`` accepts html and returns extracted text.
 
 If ``guess_page_layout`` is True (False by default for backward compatibility),

From 477206178506e77c13fe96d26167855fef5cfe30 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Thu, 20 Sep 2018 12:11:40 -0700
Subject: [PATCH 39/40] update readme and add newline personalization tests

---
 README.rst              | 40 +++++++++++++++++++---------------------
 html_text/__init__.py   |  2 +-
 tests/test_html_text.py | 20 +++++++++++++++++++-
 3 files changed, 39 insertions(+), 23 deletions(-)

diff --git a/README.rst b/README.rst
index d947c80..9ee8b98 100644
--- a/README.rst
+++ b/README.rst
@@ -57,18 +57,27 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!')
     u'Hello world!'
 
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
-    u'Hello
-    world!'
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    'Hello\n\nworld!'
+
+
+It is possible to add specific tags to ``html_text.NEWLINE_TAGS`` and
+``html_text.DOUBLE_NEWLINE_TAGS``:
+
+    >>> html_text.extract_text(
+    ...     u'<a>Hello</a> <a>world!</a>',
+    ...     guess_page_layout=True,
+    ...     newline_tags=html_text.NEWLINE_TAGS | {'a'})
+    'Hello\n\nworld!'
+
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
     >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
-    >>> text = html_text.extract_text(tree)
+    >>> html_text.extract_text(tree)
     u'Hello world!'
 
 Or define a selector to extract text only from specific elements:
@@ -76,7 +85,7 @@ Or define a selector to extract text only from specific elements:
     >>> import html_text
     >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
     >>> subsel = sel.xpath('//h1')
-    >>> text = html_text.selector_to_text(subsel)
+    >>> html_text.selector_to_text(subsel)
     u'Hello'
 
 Passed html will be first cleaned from invisible non-text content such
@@ -94,21 +103,10 @@ The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector``
    extracted text.
 
 If ``guess_page_layout`` is True (False by default for backward compatibility),
-a newline is added before and after NEWLINE_TAGS and two newlines are added
-before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
-more similar to how it is rendered in the browser.
-NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized; these are the lists of
-the tags that are handled by default:
-
-* NEWLINE_TAGS = frozenset([
-      'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
-      'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
-      'nav', 'table', 'tr'
-  ])
-* DOUBLE_NEWLINE_TAGS = frozenset([
-      'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
-      'p', 'pre', 'title', 'ul'
-  ])
+a newline is added before and after ``newline_tags`` and two newlines are added
+before and after ``double_newline_tags``. This heuristic makes the extracted text
+more similar to how it is rendered in the browser. Default newline and double
+newline tags can be found in ``html_text.NEWLINE_TAGS`` and ``html_text.DOUBLE_NEWLINE_TAGS``.
diff --git a/html_text/__init__.py b/html_text/__init__.py
index 61ef192..843b010 100644
--- a/html_text/__init__.py
+++ b/html_text/__init__.py
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 
 from .html_text import (extract_text, parse_html, cleaned_selector,
-                        selector_to_text)
+                        selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
index eb8cacf..fe53f98 100644
--- a/tests/test_html_text.py
+++ b/tests/test_html_text.py
@@ -4,7 +4,7 @@ import glob
 
 from html_text import (extract_text, parse_html, cleaned_selector,
-                       selector_to_text)
+                       selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
 
 
 @pytest.fixture(params=[{
@@ -127,6 +127,24 @@ def test_adjust_newline():
                          guess_page_layout=True) == ('text 1\n\ntext 2'))
 
 
+def test_personalize_newlines_sets():
+    html = (u'<span>text<a>more</a></span>'
+            'and more text <a> and some more </a>')
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        newline_tags=NEWLINE_TAGS | {'a'}
+    ) == 'text\nmore\nand more text\nand some more')
+
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'}
+    ) == 'text\n\nmore\n\nand more text\n\nand some more')
+
+
 def test_webpages():
     webpages = sorted(glob.glob('./test_webpages/*.html'))
     extracted = sorted(glob.glob('./test_webpages/*.txt'))

From 05b979a6f43fa27d10eb29df1cdfae14b566f235 Mon Sep 17 00:00:00 2001
From: Ludovica Gonella
Date: Thu, 20 Sep 2018 14:41:30 -0700
Subject: [PATCH 40/40] change documentation

---
 html_text/html_text.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/html_text/html_text.py b/html_text/html_text.py
index cb64058..ecb7431 100644
--- a/html_text/html_text.py
+++ b/html_text/html_text.py
@@ -186,11 +186,12 @@ def extract_text(html,
     and is just a heuristic.
 
     When guess_page_layout is True (default is False), a newline is added
-    before and after NEWLINE_TAGS and two newlines are added before and after
-    DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar
+    before and after newline_tags and two newlines are added before and after
+    double_newline_tags. This heuristic makes the extracted text more similar
     to how it is rendered in the browser.
 
-    NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized.
+    NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be extended; check the README
+    for an example of how to do it.
 
     html should be a unicode string or an already parsed lxml.html element.
     """
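A closing usage sketch of the customization API as it stands at the end of
the series; the markup mirrors test_personalize_newlines_sets and the
outputs are the ones the new tests assert:

    import html_text

    html = u'<span>text<a>more</a></span>and more text <a> and some more </a>'

    html_text.extract_text(
        html,
        guess_page_layout=True,
        newline_tags=html_text.NEWLINE_TAGS | {'a'})
    # -> 'text\nmore\nand more text\nand some more'

    html_text.extract_text(
        html,
        guess_page_layout=True,
        double_newline_tags=html_text.DOUBLE_NEWLINE_TAGS | {'a'})
    # -> 'text\n\nmore\n\nand more text\n\nand some more'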