Skip to content

Commit

Permalink
Fix logging when run in parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
dlazesz committed Jun 3, 2021
1 parent 9794ac1 commit a921cc0
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 8 deletions.
2 changes: 1 addition & 1 deletion html2tei/portal_article_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ def init_portal(log_dir, output_dir, run_params, portal_name, tei_logger, warc_l
# - the portal-specific base TEI XML in string format
# - the portal-specific get_meta function
# - the write-out mode (e.g. Custom Article Body Converter, JusText, Newspaper3k)
process_article_clean_params = (tei_logger, portal_xml_string, get_meta_fun_spec, write_out_mode)
process_article_clean_params = [tei_logger, portal_xml_string, get_meta_fun_spec, write_out_mode] # Must be list!
# Params for write_mode from the loaded portal-specific configuration
# - article root params for find_all
# - portal-specific decompose functions
Expand Down
16 changes: 11 additions & 5 deletions html2tei/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# -*- coding: utf-8, vim: expandtab:ts=4 -*-


from multiprocessing import Pool
from multiprocessing import Pool, Manager
from contextlib import contextmanager
from os.path import isdir as os_path_isdir
from threading import Lock as threading_Lock
Expand Down Expand Up @@ -106,10 +106,16 @@ def run_multiple_process(warc_filename, file_names_and_modes, main_function, sub
(multi-page articles are handled as one entry) and yield the result after it has been filtered through after_function
"""
# This is parallel as it computes each page separately. Order preserved!
with open_multiple_files(file_names_and_modes) as fhandles, Pool() as p:
queue = p.imap(main_function, aggregated_multipage_articles_gen(warc_filename, sub_functions), chunksize=1000)
for ret in queue: # This is single process because it writes to files
yield after_function(ret, after_params, fhandles)
with Manager() as man:
log_queue = man.Queue()
logger_obj = sub_functions[0][0]
with logger_obj.init_mp_logging_context(log_queue) as mp_logger, \
open_multiple_files(file_names_and_modes) as fhandles, Pool() as p:
sub_functions[0][0] = mp_logger
queue = p.imap(main_function, aggregated_multipage_articles_gen(warc_filename, sub_functions),
chunksize=1000)
for ret in queue: # This is single process because it writes to files
yield after_function(ret, after_params, fhandles)


# This function is used outside of this file
Expand Down
2 changes: 1 addition & 1 deletion html2tei/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

__version__ = '1.0.0'
__version__ = '1.0.1'
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def import_pyhton_file(module_name, file_path):
python_requires='>=3.6',
install_requires=['beautifulsoup4>=4.9.0,<5.0.0', 'justext>=2.2.0,<3.0.0', 'lxml>=4.5.0,<5.0.0',
'newspaper3k>=0.2.8,<1.0.0', 'pyyaml>=5.3.0,<6.0.0', 'warcio>=1.7.0,<2.0.0',
'webarticlecurator>=1.2.0,<2.0.0'],
'webarticlecurator>=1.4.0,<2.0.0'],
include_package_data=True,
entry_points={
'console_scripts': [
Expand Down

0 comments on commit a921cc0

Please sign in to comment.