Skip to content

Commit

Permalink
Merge pull request #82 from alan-turing-institute/deal-with-nested-blocks
Browse files Browse the repository at this point in the history

Deal with nested blocks
  • Loading branch information
jemrobinson authored Aug 5, 2019
2 parents 06f2046 + fe38bda commit fd59b60
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 20 deletions.
5 changes: 4 additions & 1 deletion readabilipy/simple_tree.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Turn input HTML into a cleaned parsed tree."""
from bs4 import BeautifulSoup
from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unwrap_elements, wrap_bare_text
from .simplifiers.html import consolidate_text, insert_paragraph_breaks, normalise_strings, process_special_elements, process_unknown_elements, recursively_prune_elements, remove_blacklist, remove_empty_strings_and_elements, remove_metadata, strip_attributes, structural_elements, unnest_paragraphs, unwrap_elements, wrap_bare_text


def simple_tree_from_html_string(html):
Expand Down Expand Up @@ -36,6 +36,9 @@ def simple_tree_from_html_string(html):
# Remove empty string elements
remove_empty_strings_and_elements(soup)

# Split out block-level elements illegally contained inside paragraphs
unnest_paragraphs(soup)

# Replace <br> and <hr> elements with paragraph breaks
# Must come after remove_empty_strings_and_elements so that consecutive <br>s can be identified
# Re-consolidates strings at the end, so must come before normalise_strings
Expand Down
36 changes: 30 additions & 6 deletions readabilipy/simplifiers/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,33 @@ def remove_empty_strings_and_elements(soup):
element.extract()


def _first_nested_block(soup, tag_name):
    """Return (paragraph, nested) for the first <p> with a `tag_name` descendant, or None."""
    for paragraph in soup.find_all("p"):
        nested = paragraph.find(tag_name)
        if nested is not None:
            return paragraph, nested
    return None


def unnest_paragraphs(soup):
    """Split out block-level elements illegally contained inside paragraphs.

    Each <p> that contains one of the block-level tags below is replaced, in
    place, by three siblings: a new <p> holding the content that preceded the
    block, the block itself, and a new <p> holding the content that followed it.
    The tree is modified in place; nothing is returned.

    If the block is not a direct child of the paragraph, the paragraph is split
    around the direct child that contains it, so that no surrounding content is
    discarded when the original paragraph is removed.
    """
    illegal_elements = ["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset",
                        "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
                        "hr", "li", "main", "nav", "noscript", "ol", "p", "pre", "section", "table", "tfoot",
                        "ul", "video"]
    for nested_type in illegal_elements:
        # Search for a paragraph that needs to be split
        match = _first_nested_block(soup, nested_type)
        while match is not None:
            paragraph, elem_nested = match
            # Climb to the direct child of the paragraph that holds the nested element,
            # so that before/after cover *all* of the paragraph's remaining content
            anchor = elem_nested
            while anchor.parent is not paragraph:
                anchor = anchor.parent
            # previous_siblings iterates nearest-first, so reverse to keep document order.
            # The lists are materialised first because append() re-parents each node.
            p_before = soup.new_tag("p")
            for sibling in reversed(list(anchor.previous_siblings)):
                p_before.append(sibling)
            p_after = soup.new_tag("p")
            for sibling in list(anchor.next_siblings):
                p_after.append(sibling)
            # Replace the paragraph by before/nested/after.
            # NB. this is done in reverse order as we are adding after the current position
            paragraph.insert_after(p_after)
            paragraph.insert_after(anchor)
            paragraph.insert_after(p_before)
            paragraph.decompose()
            # Rerun the search now that we have rewritten the tree
            match = _first_nested_block(soup, nested_type)


def insert_paragraph_breaks(soup):
"""Identify <br> and <hr> and split their parent element into multiple elements where appropriate."""
# Indicator which is used as a placeholder to mark paragraph breaks
Expand All @@ -167,7 +194,7 @@ def insert_paragraph_breaks(soup):

# If there's only one <br> then we replace it with a space
if len(br_element_chain) == 1:
br_element_chain[0].replace_with(" ")
br_element_chain[0].replace_with(' ')
# If there are multiple <br>s then replace them with BREAK_INDICATOR
else:
br_element_chain[0].replace_with(BREAK_INDICATOR)
Expand Down Expand Up @@ -199,11 +226,8 @@ def insert_paragraph_breaks(soup):
new_p_element = soup.new_tag("p")
new_p_element.string = text_fragment
parent_element.insert_after(new_p_element)
# Replace the parent string if it exists or add one if not
if parent_element.string:
parent_element.string.replace_with(text_fragments[0])
else:
parent_element.string = text_fragments[0]
# Replace this element by a navigable string containing the first text fragment
element.replace_with(NavigableString(text_fragments[0]))
# Otherwise we want to simply include all the text fragments as independent NavigableStrings (that will be wrapped later)
else:
# Iterate in reverse order as we are repeatedly adding new elements directly after the original one
Expand Down
18 changes: 9 additions & 9 deletions tests/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from ..readabilipy.simple_json import extract_text_blocks_as_plain_text


def check_exact_html_output(test_fragment, expected_output=None):
"""Check that expected output is present when parsing HTML fragment."""
def get_normalised_html_output(test_fragment, expected_output=None):
"""Get normalised HTML output."""
if expected_output is None:
expected_output = test_fragment
article_json = simple_json_from_html_string(test_fragment)
Expand All @@ -16,18 +16,18 @@ def check_exact_html_output(test_fragment, expected_output=None):
normalised_result = strip_html_whitespace(content)
print("expectation:", normalised_expectation)
print("result:", normalised_result)
return (normalised_expectation, normalised_result)


def check_exact_html_output(test_fragment, expected_output=None):
    """Assert that the normalised parse of the fragment equals the normalised expectation."""
    outputs = get_normalised_html_output(test_fragment, expected_output)
    expectation, result = outputs
    assert expectation == result


def check_html_output_contains_text(test_fragment, expected_output=None):
    """Check that expected output is contained in the result of parsing the HTML fragment.

    Unlike check_exact_html_output, this only requires the normalised expectation
    to appear somewhere inside the normalised result.
    """
    # Normalisation is delegated to the shared helper rather than repeated here
    normalised_expectation, normalised_result = get_normalised_html_output(test_fragment, expected_output)
    assert normalised_expectation in normalised_result


Expand Down
34 changes: 30 additions & 4 deletions tests/test_weird_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_paragraph_splitting_with_unclosed_tags():
)


# Test nested superscript
# Test (possibly illegal) nested elements
def test_nested_superscript():
"""Ensure that nested superscripts are correctly parsed."""
check_exact_html_output(
Expand All @@ -76,16 +76,14 @@ def test_nested_superscript():
)


# Test linebreaks inside superscript
def test_linebreaks_inside_superscript():
def test_nested_linebreaks_inside_superscript():
    """A single linebreak inside a superscript should join the text with a space."""
    fragment = "<p>Some text <sup>with<br/>superscripts</sup> that should be joined.</p>"
    expected = "<div><p>Some text ^with superscripts that should be joined.</p></div>"
    check_exact_html_output(fragment, expected)


# Test nested superscript with linebreaks
def test_nested_superscript_with_linebreaks():
"""Ensure that nested superscripts with linebreaks are correctly parsed."""
check_exact_html_output(
Expand All @@ -96,3 +94,31 @@ def test_nested_superscript_with_linebreaks():
</p>""",
"<div><p>Some text with linebreaks ^ ^around a footnote.</p></div>"
)


def test_nested_table_inside_paragraph():
    """A table (illegally) nested inside a paragraph should be split out of it."""
    # Input: text, a double linebreak and a table, all inside one <p>
    fragment = """
<p>
First paragraph.
<br/><br/>
<table>
<tbody>
<tr>
<td>Table text.</td>
</tr>
</tbody>
</table>
Second paragraph.
</p>"""
    # Expected: the table sits between two separate paragraphs
    expected = "<div><p>First paragraph.</p><table><tbody><tr><td>Table text.</td></tr></tbody></table><p>Second paragraph.</p></div>"
    check_exact_html_output(fragment, expected)


def test_nested_span_inside_paragraph():
    """Ensure that spans nested inside paragraphs are kept in.

    A <span> is inline content, so the paragraph must not be split around it.
    """
    check_exact_html_output(
        "<p>Some text <span>in a span</span> that should stay together.</p>",
        "<div><p>Some text in a span that should stay together.</p></div>"
    )

0 comments on commit fd59b60

Please sign in to comment.