Skip to content

Commit

Permalink
Merge pull request #14 from TeamHG-Memex/fix-webpage-tests
Browse files Browse the repository at this point in the history
Fix webpage tests
  • Loading branch information
kmike committed Nov 17, 2018
2 parents 8696f80 + 6244867 commit b5cd26a
Showing 1 changed file with 36 additions and 10 deletions.
46 changes: 36 additions & 10 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
# -*- coding: utf-8 -*-
import pytest
import glob
import os

import six
import pytest

from html_text import (extract_text, parse_html, cleaned_selector,
selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)


ROOT = os.path.dirname(os.path.abspath(__file__))


@pytest.fixture(params=[
{'guess_punct_space': True, 'guess_layout': False},
{'guess_punct_space': False, 'guess_layout': False},
Expand Down Expand Up @@ -107,6 +113,14 @@ def test_selectors(all_options):
assert selector_to_text(subsel, **all_options) == ''


def test_nbsp():
    """Check that an ``&nbsp;`` entity between words is extracted as a space.

    On Python 2 the test is reported as an expected failure: per the xfail
    reason below, the entity yields a non-breaking space there instead of
    a regular one.
    """
    if six.PY2:
        # pytest.xfail() raises on its own, so no explicit ``raise`` is
        # needed.  The backslash is escaped so the reason shows the
        # escape sequence rather than embedding a raw 0xA0 in the message.
        pytest.xfail("&nbsp; produces '\\xa0' in Python 2, "
                     "but ' ' in Python 3")
    html = "<h1>Foo&nbsp;Bar</h1>"
    assert extract_text(html) == "Foo Bar"


def test_guess_layout():
html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
Expand Down Expand Up @@ -149,12 +163,24 @@ def test_personalize_newlines_sets():
assert text == 'text\n\nmore\n\nand more text\n\nand some more'


def test_webpages():
    """Compare the text extracted from each saved page with its expected text.

    Pages are the ``*.html`` files and expected outputs the ``*.txt`` files
    in the ``test_webpages`` directory under ``ROOT``; the two sorted
    listings are paired by position.

    NOTE(review): a later definition of ``test_webpages`` in this module
    rebinds the name -- confirm which of the two is meant to be kept.
    """
    # Anchor the patterns at ROOT so the test does not depend on the
    # directory pytest happens to be started from.
    pages_dir = os.path.join(ROOT, 'test_webpages')
    webpages = sorted(glob.glob(os.path.join(pages_dir, '*.html')))
    extracted = sorted(glob.glob(os.path.join(pages_dir, '*.txt')))
    for page, extr in zip(webpages, extracted):
        # _load_file reads bytes and decodes them explicitly; the builtin
        # open() does not accept an ``encoding`` argument on Python 2.
        html = _load_file(page)
        expected = _load_file(extr)
        assert extract_text(html) == expected
def _webpage_paths():
    """Return ``(html_path, txt_path)`` pairs for the saved test web pages.

    Both listings are taken from the ``test_webpages`` directory under
    ``ROOT``, sorted, and then paired by position.

    Raises:
        ValueError: if the number of ``.html`` and ``.txt`` files differs;
            ``zip`` would otherwise silently drop the unpaired entries.
    """
    pages_dir = os.path.join(ROOT, 'test_webpages')
    webpages = sorted(glob.glob(os.path.join(pages_dir, '*.html')))
    extracted = sorted(glob.glob(os.path.join(pages_dir, '*.txt')))
    if len(webpages) != len(extracted):
        raise ValueError(
            'test_webpages contains %d .html files but %d .txt files'
            % (len(webpages), len(extracted)))
    return list(zip(webpages, extracted))


def _load_file(path):
    """Read the file at *path* as bytes and return it decoded as UTF-8 text."""
    # Binary mode plus an explicit decode: no newline translation is applied.
    with open(path, 'rb') as stream:
        raw = stream.read()
    return raw.decode('utf8')


@pytest.mark.parametrize(['page', 'extracted'], _webpage_paths())
def test_webpages(page, extracted):
    """Extract text from one saved page and compare it with the expected text.

    Runs once per ``(page, extracted)`` pair returned by ``_webpage_paths()``.
    """
    raw = _load_file(page)
    # FIXME: &nbsp; produces '\xa0' in Python 2, but ' ' in Python 3.
    # That difference is ignored in this test by replacing the entity
    # with a plain space when not running on Python 3.
    # What is the correct behavior?
    markup = raw if six.PY3 else raw.replace('&nbsp;', ' ')
    reference = _load_file(extracted)
    assert extract_text(markup) == reference

0 comments on commit b5cd26a

Please sign in to comment.