Skip to content

Commit

Permalink
Merge pull request #36 from elifesciences/develop
Browse files Browse the repository at this point in the history
PR for release of version 0.10.0
  • Loading branch information
gnott authored Mar 1, 2022
2 parents 886be5e + 5060f99 commit 478177b
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 101 deletions.
2 changes: 1 addition & 1 deletion elifecleaner/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging


__version__ = "0.9.1"
__version__ = "0.10.0"


LOGGER = logging.getLogger(__name__)
Expand Down
85 changes: 56 additions & 29 deletions elifecleaner/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,59 +18,48 @@ def check_ejp_zip(zip_file, tmp_dir):
xml_asset = article_xml_asset(asset_file_name_map)
root = parse_article_xml(xml_asset[1])
files = file_list(root)
# use the zip file name as the identifier for log messages
identifer = zip_file.split(os.sep)[-1]
return check_files(files, asset_file_name_map, identifer)


def check_files(files, asset_file_name_map, identifier):
figures = figure_list(files, asset_file_name_map)
zip_file_name = zip_file.split(os.sep)[-1]
figures = set_figure_pdf_pages_count(figures)
# check for multiple page PDF figures
check_multi_page_figure_pdf(figures, zip_file_name)
check_multi_page_figure_pdf(figures, identifier)
# check for missing files
missing_files = find_missing_files(files, asset_file_name_map)
for missing_file in missing_files:
LOGGER.warning(
"%s does not contain a file in the manifest: %s",
zip_file_name,
missing_file,
)
check_missing_files(files, asset_file_name_map, identifier)
# check for file not listed in the manifest
extra_files = find_extra_files(files, asset_file_name_map)
for extra_file in extra_files:
LOGGER.warning(
"%s has file not listed in the manifest: %s", zip_file_name, extra_file
)
extra_files = check_extra_files(files, asset_file_name_map, identifier)
# check for out of sequence files by name
missing_files_by_name = find_missing_files_by_name(files)
for missing_file in missing_files_by_name:
LOGGER.warning(
"%s has file misisng from expected numeric sequence: %s",
zip_file_name,
missing_file,
)

check_missing_files_by_name(files, identifier)
return True


def check_multi_page_figure_pdf(figures, zip_file_name):
def check_multi_page_figure_pdf(figures, identifier):
pdfimages_available = pdf_utils.pdfimages_exists()
for pdf in [pdf for pdf in figures if pdf.get("pages") and pdf.get("pages") > 1]:
is_multi_page = False
if pdfimages_available:
LOGGER.info(
"%s using pdfimages to check PDF figure file: %s",
zip_file_name,
identifier,
pdf.get("file_name"),
)
try:
image_pages = pdf_utils.pdf_image_pages(pdf.get("file_path"))
LOGGER.info(
"%s pdfimages found images on pages %s in PDF figure file: %s",
zip_file_name,
identifier,
image_pages,
pdf.get("file_name"),
)
is_multi_page = bool([page for page in image_pages if page > 1])
except:
LOGGER.exception(
"%s exception using pdfimages to check PDF figure file: %s",
zip_file_name,
identifier,
pdf.get("file_name"),
)
# consider it multi page in the case pdfimages raises an exception
Expand All @@ -80,11 +69,22 @@ def check_multi_page_figure_pdf(figures, zip_file_name):
if is_multi_page:
LOGGER.warning(
"%s multiple page PDF figure file: %s",
zip_file_name,
identifier,
pdf.get("file_name"),
)


def check_missing_files(files, asset_file_name_map, identifier):
    """Log a warning for every file that find_missing_files() reports.

    files and asset_file_name_map are handed to find_missing_files()
    unchanged; identifier is the prefix used in each log message.
    Nothing is returned.
    """
    for absent_name in find_missing_files(files, asset_file_name_map):
        LOGGER.warning(
            "%s does not contain a file in the manifest: %s",
            identifier,
            absent_name,
        )


def find_missing_files(files, asset_file_name_map):
"for each file name from the manifest XML file, check for missing files in the zip contents"
missing_files = []
Expand All @@ -97,6 +97,15 @@ def find_missing_files(files, asset_file_name_map):
return missing_files


def check_extra_files(files, asset_file_name_map, identifier):
    """Log a warning for every file that find_extra_files() reports.

    files and asset_file_name_map are handed to find_extra_files()
    unchanged; identifier is the prefix used in each log message.
    Nothing is returned.
    """
    message = "%s has file not listed in the manifest: %s"
    for unlisted_name in find_extra_files(files, asset_file_name_map):
        LOGGER.warning(message, identifier, unlisted_name)


def find_extra_files(files, asset_file_name_map):
"check if any file names are missing from the manifest XML"
extra_files = []
Expand Down Expand Up @@ -125,6 +134,17 @@ def find_extra_files(files, asset_file_name_map):
return extra_files


def check_missing_files_by_name(files, identifier):
    """Log a warning for every file that find_missing_files_by_name() reports.

    files is handed to find_missing_files_by_name() unchanged; identifier
    is the prefix used in each log message. Nothing is returned.
    """
    for absent_name in find_missing_files_by_name(files):
        # the message previously misspelled "missing" as "misisng"
        LOGGER.warning(
            "%s has file missing from expected numeric sequence: %s",
            identifier,
            absent_name,
        )


def find_missing_files_by_name(files):
"""
In the manifest file names look for any missing from the expected numeric sequence
Expand Down Expand Up @@ -313,6 +333,7 @@ def file_list(root):


def figure_list(files, asset_file_name_map):
"identify which files are a figure and collect some data about them"
figures = []

figure_files = [
Expand All @@ -329,12 +350,18 @@ def figure_list(files, asset_file_name_map):
figure_detail["file_name"] = asset_file_name[0]
figure_detail["file_path"] = asset_file_name[1]
break
if figure_detail["extension"] == "pdf":
figure_detail["pages"] = pdf_page_count(figure_detail.get("file_path"))
figures.append(figure_detail)
return figures


def set_figure_pdf_pages_count(figure_assets):
    """Store a page count on each figure whose "extension" is "pdf".

    For those figures the result of pdf_page_count() on the figure's
    "file_path" value is saved under the "pages" key. The dicts are
    modified in place and the same figure_assets object is returned.
    """
    # lazily filtered so each figure is checked and updated in turn
    pdf_figures = (
        detail for detail in figure_assets if detail["extension"] == "pdf"
    )
    for detail in pdf_figures:
        detail["pages"] = pdf_page_count(detail.get("file_path"))
    return figure_assets


def file_extension(file_name):
    """Return the lowercased text after the last "." in file_name.

    None is returned when file_name is empty or None, or when it
    contains no "." at all.
    """
    if not file_name or "." not in file_name:
        return None
    _, _, suffix = file_name.rpartition(".")
    return suffix.lower()

Expand Down
Loading

0 comments on commit 478177b

Please sign in to comment.