Add missing article_body_converters

ELTE-DH · Dec 17, 2021 · b978063 · b978063
1 parent 782bfdb
commit b978063
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 0 deletions.
diff --git a/html2tei/article_body_converters/justext_abc.py b/html2tei/article_body_converters/justext_abc.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8, vim: expandtab:ts=4 -*-
+
+from bs4 import BeautifulSoup
+from justext import get_stoplist, justext
+
+from html2tei.tei_utils import tei_defaultdict, create_new_tag_with_string
+
+stoplist = get_stoplist('Hungarian')  # Hungarian JusText stoplist, built once when the module is imported
+
+
+def process_article(one_page_of_article_things, body_log, get_meta_fun, spec_body_params):
+    """Extract the article's paragraphs with the JusText boilerplate removal tool.
+        one_page_of_article_things is unpacked as (url, WARC response datetime, WARC id, html)
+        Returns the metadata dictionary (only sch:url is set) and the list of kept paragraphs"""
+    url, _warc_response_datetime, _warc_id, html = one_page_of_article_things
+    _ = get_meta_fun, spec_body_params  # Silence IDE: these parameters are not used by this converter
+    empty_soup = BeautifulSoup(features='lxml')  # Handed to create_new_tag_with_string for every new tag
+    kept_paragraphs = [create_new_tag_with_string(empty_soup, par.text, 'p')
+                       for par in justext(html, stoplist) if not par.is_boilerplate]
+    if not kept_paragraphs:  # Every paragraph was classified as boilerplate (or there was none at all)
+        body_log.log('WARNING', f'JusText did not find any relevant (not boilerplate) text in the article: {url}')
+        kept_paragraphs = [create_new_tag_with_string(empty_soup, '', 'p')]  # Fall back to one empty paragraph
+    article_metas = tei_defaultdict()
+    article_metas['sch:url'] = url
+    return article_metas, kept_paragraphs
diff --git a/html2tei/article_body_converters/newspaper3k_abc.py b/html2tei/article_body_converters/newspaper3k_abc.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8, vim: expandtab:ts=4 -*-
+
+from bs4 import BeautifulSoup
+from newspaper import Article
+
+from html2tei.tei_utils import tei_defaultdict, create_new_tag_with_string
+
+
+def process_article(one_page_of_article_things, body_log, get_meta_fun, spec_body_params):
+    """Extract the article's metadata and paragraphs with the Newspaper3k tool.
+        one_page_of_article_things is unpacked as (url, WARC response datetime, WARC id, html)
+        Every non-blank line of the extracted text becomes a paragraph (a single empty one if there is none)
+        Returns the metadata dictionary (sch:url plus whatever else was extracted) and the paragraphs"""
+    url, _warc_response_datetime, _warc_id, html = one_page_of_article_things
+    _ = get_meta_fun, spec_body_params  # Silence IDE: these parameters are not used by this converter
+    n3ksoup = BeautifulSoup(features='lxml')  # Handed to create_new_tag_with_string for every new tag
+    metas_in_dict = tei_defaultdict()
+    metas_in_dict['sch:url'] = url
+    article = Article(url, language='hu')
+    article.download(input_html=html)  # The HTML is supplied directly through input_html
+    article.parse()
+    n3k_paragraphs = [create_new_tag_with_string(n3ksoup, line, 'p') for line in article.text.split('\n')
+                      if line.strip()]
+    if not n3k_paragraphs:
+        body_log.log('WARNING', f'Newspaper3k did not find any relevant (not boilerplate) text in the article: {url}')
+        n3k_paragraphs = [create_new_tag_with_string(n3ksoup, '', 'p')]
+    if article.publish_date is not None:
+        metas_in_dict['sch:datePublished'] = article.publish_date.replace(tzinfo=None)  # Stored without tzinfo
+    if article.title:  # Truthiness instead of `is not None`: an empty title must not be stored as metadata
+        metas_in_dict['sch:name'] = article.title
+    if article.authors:
+        metas_in_dict['sch:author'] = article.authors
+    if article.tags:
+        metas_in_dict['sch:keywords'] = list(article.tags)
+    return metas_in_dict, n3k_paragraphs