Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow extracting alternative text #20

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions html_text/html_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ def etree_to_text(tree,
guess_punct_space=True,
guess_layout=True,
newline_tags=NEWLINE_TAGS,
double_newline_tags=DOUBLE_NEWLINE_TAGS):
double_newline_tags=DOUBLE_NEWLINE_TAGS,
extract_alt_text=False):
"""
Convert a html tree to text. Tree should be cleaned with
``html_text.html_text.cleaner.clean_html`` before passing to this
Expand Down Expand Up @@ -135,6 +136,10 @@ def add_text(text_content, context):
def traverse_text_fragments(tree, context, handle_tail=True):
""" Extract text from the ``tree``: fill ``chunks`` variable """
add_newlines(tree.tag, context)
if extract_alt_text is True:
alt_text = tree.attrib.get('alt')
if alt_text is not None:
add_text(alt_text, context)
add_text(tree.text, context)
for child in tree:
traverse_text_fragments(child, context)
Expand Down Expand Up @@ -190,7 +195,8 @@ def extract_text(html,
guess_punct_space=True,
guess_layout=True,
newline_tags=NEWLINE_TAGS,
double_newline_tags=DOUBLE_NEWLINE_TAGS):
double_newline_tags=DOUBLE_NEWLINE_TAGS,
extract_alt_text=False):
"""
Convert html to text, cleaning invisible content such as styles.

Expand All @@ -215,6 +221,10 @@ def extract_text(html,

Default newline and double newline tags can be found in
`html_text.NEWLINE_TAGS` and `html_text.DOUBLE_NEWLINE_TAGS`.

By default, alternative text associated with ``img``, ``area`` and ``input``
tags is ignored. Set *extract_alt_text* to ``True`` to extract that text
as well.
"""
if html is None:
return ''
Expand All @@ -225,4 +235,5 @@ def extract_text(html,
guess_layout=guess_layout,
newline_tags=newline_tags,
double_newline_tags=double_newline_tags,
extract_alt_text=extract_alt_text,
)
31 changes: 31 additions & 0 deletions tests/test_html_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,34 @@ def test_webpages(page, extracted):

tree = cleaner.clean_html(parse_html(html))
assert etree_to_text(tree) == expected


def test_alt_text_disabled():
    """With default arguments, an image's ``alt`` text is not extracted."""
    # A space separates the ``alt`` and ``src`` attributes: HTML requires
    # whitespace between attributes, and this test should not rely on the
    # parser recovering from malformed markup.
    html = (u'<img alt="In the sky flies a red flag with a white cross whose '
            u'vertical bar is shifted toward the flagpole." '
            u'src="http://upload.a.org/wikipedia/commons/thumb/8/83'
            u'/Dannebrog.jpg/180px-Dannebrog.jpg">')
    text = extract_text(html)
    assert text == u''


def test_alt_text_enabled():
    """With ``extract_alt_text=True``, an image's ``alt`` text is returned."""
    # A space separates the ``alt`` and ``src`` attributes: HTML requires
    # whitespace between attributes, and this test should not rely on the
    # parser recovering from malformed markup.
    html = (u'<img alt="In the sky flies a red flag with a white cross whose '
            u'vertical bar is shifted toward the flagpole." '
            u'src="http://upload.a.org/wikipedia/commons/thumb/8/83'
            u'/Dannebrog.jpg/180px-Dannebrog.jpg">')
    text = extract_text(html, extract_alt_text=True)
    assert text == (u'In the sky flies a red flag with a white cross whose '
                    u'vertical bar is shifted toward the flagpole.')


def test_alt_text_between_paragraphs_disabled():
    """By default, an ``img`` between two paragraphs contributes no text."""
    markup = u'<p>1</p><img alt="2" /><p>3</p>'
    assert extract_text(markup) == u'1\n\n3'


def test_alt_text_between_paragraphs_enabled():
    """With ``extract_alt_text=True``, the ``alt`` text appears between the
    two paragraphs' text in the output."""
    markup = u'<p>1</p><img alt="2" /><p>3</p>'
    extracted = extract_text(markup, extract_alt_text=True)
    assert extracted == u'1\n\n2\n\n3'