-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from TeamHG-Memex/inline-tags-spaces
Fix unwanted joins for inline tags
- Loading branch information
Showing 3 changed files with 73 additions and 12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,49 @@ | ||
# -*- coding: utf-8 -*- | ||
import pytest | ||
|
||
from html_text import extract_text, parse_html | ||
|
||
|
||
def test_extract_text(): | ||
@pytest.fixture(params=[{'guess_punct_space': True}, | ||
{'guess_punct_space': False}]) | ||
def all_options(request): | ||
return request.param | ||
|
||
|
||
def test_extract_text(all_options): | ||
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' | ||
assert extract_text(html) == u'Hello, world!' | ||
assert extract_text(html, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_declared_encoding(): | ||
def test_declared_encoding(all_options): | ||
html = (u'<?xml version="1.0" encoding="utf-8" ?>' | ||
u'<html><style>.div {}</style>' | ||
u'<body>Hello, world!</p></body></html>') | ||
assert extract_text(html) == u'Hello, world!' | ||
assert extract_text(html, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_empty(): | ||
assert extract_text(u'') == '' | ||
def test_empty(all_options): | ||
assert extract_text(u'', **all_options) == '' | ||
|
||
|
||
def test_extract_text_from_tree(): | ||
def test_extract_text_from_tree(all_options): | ||
html = u'<html><style>.div {}</style><body><p>Hello, world!</body></html>' | ||
tree = parse_html(html) | ||
assert extract_text(tree) == u'Hello, world!' | ||
assert extract_text(tree, **all_options) == u'Hello, world!' | ||
|
||
|
||
def test_inline_tags_whitespace(all_options): | ||
html = u'<span>field</span><span>value of</span><span></span>' | ||
assert extract_text(html, **all_options) == u'field value of' | ||
|
||
|
||
def test_punct_whitespace(): | ||
html = u'<div><span>field</span>, and more</div>' | ||
assert extract_text(html, guess_punct_space=False) == u'field , and more' | ||
|
||
|
||
def test_punct_whitespace_preserved(): | ||
html = (u'<div><span>по</span><span>ле</span>, and , ' | ||
u'<span>more </span>!<span>now</div>a (<b>boo</b>)') | ||
assert (extract_text(html, guess_punct_space=True) == | ||
u'по ле, and , more ! now a (boo)') |