
Commit

Add missing article_body_converters
dlazesz committed Dec 17, 2021
1 parent 782bfdb commit b978063
Showing 2 changed files with 62 additions and 0 deletions.
26 changes: 26 additions & 0 deletions html2tei/article_body_converters/justext_abc.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

from bs4 import BeautifulSoup
from justext import get_stoplist, justext

from html2tei.tei_utils import tei_defaultdict, create_new_tag_with_string

stoplist = get_stoplist('Hungarian')


def process_article(one_page_of_article_things, body_log, get_meta_fun, spec_body_params):
    """Using the JusText boilerplate removal tool to extract the article's paragraphs
        Returns the metadata dictionary and paragraphs"""
    url, warc_response_datetime, warc_id, html = one_page_of_article_things
    _ = url, get_meta_fun, spec_body_params  # Silence IDE
    justasoup = BeautifulSoup(features='lxml')
    paragraphs = justext(html, stoplist)
    metas_in_dict = tei_defaultdict()
    metas_in_dict['sch:url'] = url
    justext_paragraphs = [create_new_tag_with_string(justasoup, paragraph.text, 'p') for paragraph in paragraphs
                          if not paragraph.is_boilerplate]
    if len(justext_paragraphs) == 0:  # JusText did not find any relevant (not boilerplate) text in the article
        body_log.log('WARNING', f'JusText did not find any relevant (not boilerplate) text in the article: {url}')
        justext_paragraphs = [create_new_tag_with_string(justasoup, '', 'p')]
    return metas_in_dict, justext_paragraphs
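For orientation, a minimal sketch (not part of the commit) of how this JusText-based converter could be called on its own. The stub logger and the example page tuple below are hypothetical stand-ins for the objects html2tei normally supplies; only the .log() method and the (url, warc_response_datetime, warc_id, html) ordering used above are assumed.

# Hypothetical driver for justext_abc.process_article (not part of the commit).
from datetime import datetime

from html2tei.article_body_converters.justext_abc import process_article


class StubLog:
    """Hypothetical stand-in for the body logger; only the .log() method is assumed."""
    def log(self, level, message):
        print(level, message)


html_str = '<html><body><p>Example article text...</p></body></html>'
page = ('https://example.com/article', datetime.now(), '<urn:uuid:example>', html_str)
metas, paragraphs = process_article(page, StubLog(), None, None)
print(metas['sch:url'], len(paragraphs))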
36 changes: 36 additions & 0 deletions html2tei/article_body_converters/newspaper3k_abc.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

from bs4 import BeautifulSoup
from newspaper import Article

from html2tei.tei_utils import tei_defaultdict, create_new_tag_with_string


def process_article(one_page_of_article_things, body_log, get_meta_fun, spec_body_params):
    """Using the Newspaper3k tool to extract the metadata and paragraphs from the article
        Returns the metadata dictionary and paragraphs"""
    _ = body_log, get_meta_fun, spec_body_params  # Silence IDE
    url, warc_response_datetime, warc_id, html = one_page_of_article_things
    n3ksoup = BeautifulSoup(features='lxml')
    metas_in_dict = tei_defaultdict()
    metas_in_dict['sch:url'] = url
    a = Article(url, language='hu')
    a.download(input_html=html)
    a.parse()
    n3k_paragraphs = [create_new_tag_with_string(n3ksoup, p_text, 'p') for p_text in a.text.split('\n')
                      if len(p_text.strip()) > 0]
    if len(n3k_paragraphs) == 0:
        body_log.log('WARNING',
                     f'Newspaper3k did not find any relevant (not boilerplate) text in the article: {url}')
        n3k_paragraphs = [create_new_tag_with_string(n3ksoup, '', 'p')]

    if a.publish_date is not None:
        metas_in_dict['sch:datePublished'] = a.publish_date.replace(tzinfo=None)
    if a.title is not None:
        metas_in_dict['sch:name'] = a.title
    if len(a.authors) > 0:
        metas_in_dict['sch:author'] = a.authors
    if len(a.tags) > 0:
        metas_in_dict['sch:keywords'] = list(a.tags)
    return metas_in_dict, n3k_paragraphs
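A matching sketch (again not part of the commit) for the Newspaper3k-based converter, reusing the same assumed calling convention; it simply prints which schema.org-style metadata keys get filled for a given page. The stub logger and the example inputs are hypothetical.

# Hypothetical driver for newspaper3k_abc.process_article (not part of the commit).
from datetime import datetime

from html2tei.article_body_converters.newspaper3k_abc import process_article


class StubLog:
    """Hypothetical logger stand-in exposing the .log() method used above."""
    def log(self, level, message):
        print(level, message)


html_str = '<html><head><title>Cím</title></head><body><p>Példa szöveg...</p></body></html>'
page = ('https://example.com/cikk', datetime.now(), '<urn:uuid:example>', html_str)
metas, paragraphs = process_article(page, StubLog(), None, None)
for key in ('sch:url', 'sch:datePublished', 'sch:name', 'sch:author', 'sch:keywords'):
    print(key, metas.get(key))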
