6 changes: 4 additions & 2 deletions README.md
@@ -45,11 +45,13 @@ python -m mapping_cli

**1. SEM Mapping**

Use the `sem` subcommand for SEM mapping. The mapper expects a map file, an image or image metadata file, and a JSON output path:
Use the `sem` subcommand for SEM mapping. The mapper expects a map file, an input file (an image, an image metadata file, or a ZIP archive containing multiple such files), and an output path whose format depends on the input type: a JSON output path for a single input, or a ZIP output path for a ZIP input:
```
python -m mapping_cli sem -m <map_file> -i <image or metadata file> -o <json_output_path>
python -m mapping_cli sem -m <map_file> -i <image, metadata file, or zip_file> -o <json_or_zip_output_path>
```

Note that the ZIP output file will contain only the successfully mapped files, in JSON format.
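
For example, a hypothetical invocation with a ZIP archive (the file names here are illustrative, not files shipped with the repo):
```
python -m mapping_cli sem -m my_map.json -i my_images.zip -o mapped_results.zip
```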

For further information about the necessary map file, see [Mapping README](./src/resources/maps/mapping)

**2. Tomography Mapping**
81 changes: 69 additions & 12 deletions mapping_cli.py
@@ -3,11 +3,13 @@
import logging
import os
from sys import exit
from pathlib import Path

from src.IO.MappingAbortionError import MappingAbortionError
from src.IO.sem.InputReader import InputReader as InputReader_SEM
from src.IO.tomo.InputReader import InputReader as InputReader_TOMO
from src.IO.tomo.OutputWriter import OutputWriter
from src.IO.sem.OutputWriter import OutputWriter as OutputWriter_SEM
from src.IO.tomo.OutputWriter import OutputWriter as OutputWriter_TOMO
from src.resources.maps.parsing import map_from_flag

# make log level configurable from ENV, defaults to info level
@@ -87,8 +89,8 @@ def run_tomo_mapper(args):
#si = setup_infos if len(setup_infos) >= 1 else None
#ri = run_infos if len(run_infos) >= 1 else None

output = OutputWriter.stitch_together(setup_infos, run_infos, imgs)
OutputWriter.writeOutput(output, OUTPUT_PATH)
output = OutputWriter_TOMO.stitch_together(setup_infos, run_infos, imgs)
OutputWriter_TOMO.writeOutput(output, OUTPUT_PATH)
except MappingAbortionError as e:
reader.clean_up()
exit(e)
@@ -103,18 +105,73 @@ def run_sem_mapper(args):
MAP_SOURCE = argdict.get('map')
OUTPUT_PATH = argdict.get('output')

reader = None

try:
reader = InputReader_SEM(MAP_SOURCE, INPUT_SOURCE)

img_info = reader.retrieve_image_info(INPUT_SOURCE)
if not img_info:
logging.error('Could not retrieve image information due to unknown error. Aborting.')
exit(1)
with open(OUTPUT_PATH, 'w', encoding="utf-8") as f:
json.dump(img_info, f, indent=4, ensure_ascii=False)
reader = InputReader_SEM(MAP_SOURCE, INPUT_SOURCE, OUTPUT_PATH)
tmpdir = reader.temp_dir_path

if tmpdir:
# The case of a zipped input file
list_of_file_names = []
success_count = 0

for file_path in reader.filter_zipfile(tmpdir):
logging.info(f"Processing extracted file: {file_path.name}")
try:
file_name = file_path.with_suffix('').name + ".json"
reader_ = InputReader_SEM(MAP_SOURCE, file_path, file_name)
img_info = reader_.retrieve_image_info(file_path)
logging.debug(f"IMAGE_INFO: {img_info}")

if not img_info:
raise MappingAbortionError(f"Could not retrieve image information for {file_path.name}.")

OutputWriter_SEM.save_the_file(img_info, file_name)
list_of_file_names.append(file_name)
success_count += 1

except MappingAbortionError as e:
logging.warning(f"Skipping file {file_path.name} due to mapping error: {e}")
except Exception as e:
logging.exception(f"Unexpected error processing file {file_path.name}")

if success_count > 0:
logging.info(f"In total {success_count} file(s) were successfully processed.")
OutputWriter_SEM.save_to_zip(list_of_file_names, OUTPUT_PATH)
else:
raise MappingAbortionError("No files could be processed successfully. Aborting.")

else:
# The case of a single input file
logging.info("Processing input as single file.")
img_info = reader.retrieve_image_info(INPUT_SOURCE)
if not img_info:
raise MappingAbortionError("Could not retrieve image information. Aborting.")

OutputWriter_SEM.save_the_file(img_info, OUTPUT_PATH)

except MappingAbortionError as e:
#logging.error(f"MappingAbortionError: {e}")
exit(e)

finally:
Contributor: nice, I was not even aware that 'finally' would run even on an exit call. TIL :)
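
(Indeed: `sys.exit()` raises `SystemExit` rather than terminating the process outright, so enclosing `finally` blocks still execute. A minimal sketch:)

```python
import sys

try:
    sys.exit(1)  # raises SystemExit; does not kill the process immediately
finally:
    print("this still runs before the interpreter actually exits")
```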

if reader:
reader.clean_up()
Contributor: you need to implement the clean_up function. Since it is likely identical for tomo and sem, it should ideally be inherited from a common base class.

Contributor Author: For a first quick solution, a separate clean_up function has been added in IO/sem/InputReader. A common clean_up base class will be implemented to unify cleanup logic across reader types.

Contributor: I was thinking more in the direction of an InputReader base class that either provides the interface for the clean_up method or (more likely) even implements it, since it likely always treats a working_dir used by InputReaders in the same way. Maybe there is even more overlap, especially with regard to parser handling.

Contributor Author: The InputReader base class has been implemented in a distinct branch, dev_inputreader_base_class.
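
To illustrate the direction discussed above, a minimal sketch of such a shared base class (the name `InputReaderBase` and the exact layout are assumptions, not the actual code in `dev_inputreader_base_class`):

```python
import logging
import os
import shutil


class InputReaderBase:
    """Hypothetical common base for the sem/tomo InputReaders:
    owns the temporary working directory and implements clean_up once,
    so subclasses only add their format-specific parsing logic."""

    temp_dir_path: str = None

    def clean_up(self):
        # Remove the extracted working directory, if one was created.
        if self.temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
            logging.debug("Temp folder deleted: %s (still exists: %s)",
                          self.temp_dir_path, os.path.exists(self.temp_dir_path))
        else:
            logging.debug("No temp folder used, nothing to clean up.")
```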


if __name__ == '__main__':
run_cli()
36 changes: 35 additions & 1 deletion mappingservice-plugin/integrationtests/basic.hurl
@@ -189,4 +189,38 @@ DELETE {{host}}/api/v1/mappingAdministration/{{id1}}
If-Match: {{etag}}
HTTP 204

#---------------- END Test Mapping for JEOL --------------------------------------
#---------------- BEGIN Test Mapping for ZIP FILE --------------------------------------

# Create mapping for the ZIP file test
POST {{host}}/api/v1/mappingAdministration/
Content-Type: multipart/form-data; boundary=boundary
[Options] # set all variables you need for further use in this hurl script
variable: id1=3
```
--boundary
Content-Disposition: form-data; name="record"; filename="blob"
Content-Type: application/zip

{"mappingId":"{{id1}}","mappingType": "{{mappingType}}","title":"Zeiss from CI test","description":"","acl":[]}
--boundary
Content-Disposition: form-data; name="document"; filename="blob"
Content-Type: application/json

{"Sem":"entry.instrument.instrumentName"}
--boundary--
```
HTTP 201

[Captures]
id: jsonpath "$.mappingId"

# Execute mapping with sample ZIP file
POST {{host}}/api/v1/mappingExecution/{{id1}}
[MultipartFormData]
document: file,./tests/sampleData/Archive.zip;
HTTP 200
[Asserts]
header "Content-Type" contains "application/zip"

#---------------- END Test Mapping for ZIP FILE --------------------------------------
@@ -70,12 +70,12 @@ public String uri() {

@Override
public MimeType[] inputTypes() {
return new MimeType[]{MimeTypeUtils.parseMimeType("image/tiff"), MimeTypeUtils.parseMimeType("text/plain")};
return new MimeType[]{MimeTypeUtils.parseMimeType("image/tiff"), MimeTypeUtils.parseMimeType("text/plain"), MimeTypeUtils.parseMimeType("application/zip")};
}

@Override
public MimeType[] outputTypes() {
return new MimeType[]{MimeTypeUtils.APPLICATION_JSON};
return new MimeType[]{MimeTypeUtils.APPLICATION_JSON, MimeTypeUtils.parseMimeType("application/zip")};
}

@Override
97 changes: 90 additions & 7 deletions src/IO/sem/InputReader.py
@@ -1,35 +1,110 @@
import logging
import mimetypes
import os
import shutil
from pathlib import Path

from src.IO.MappingAbortionError import MappingAbortionError
from src.parser.ImageParser import ParserMode
from src.parser.ParserFactory import ParserFactory
from src.util import load_json, get_filetype_with_magica, robust_textfile_read
from src.util import is_zipfile, extract_zip_file, load_json, get_filetype_with_magica, robust_textfile_read


class InputReader:

mapping = None
parser_names = None
temp_dir_path: str = None

def __init__(self, map_path, input_path):
def __init__(self, map_path, input_path, output_path):
logging.info("Preparing parsers based on parsing map file and input.")
self.mapping = load_json(map_path)

if not os.path.exists(input_path):
logging.error("Input file {} does not exist. Aborting".format(input_path))
raise MappingAbortionError("Input file loading failed.")

if is_zipfile(input_path):
self._handle_zip_input(input_path, output_path)
else:
self._handle_single_input(input_path)


def _handle_zip_input(self, input_path: str, output_path: str):
"""
Handle zipped input files: extract the archive and detect applicable parsers.
"""
# This checks whether the user specified the correct output_path extension, in particular
# preventing misuse of the .json extension, which is an easy mistake to make. The check
# could be improved for standalone use, but we keep it simple because the mapping service
# renames the IO files and handles the extensions correctly.
if output_path.lower().endswith('.json'):
logging.error(f"Expected '.zip' output path for zipped input '{input_path}', got '.json' instead.")
raise MappingAbortionError("Output path extension mismatch for zipped input.")

self.temp_dir_path = extract_zip_file(input_path)
found_valid_parser = False

for file_path in self.filter_zipfile(self.temp_dir_path):
self.parser_names = self.get_applicable_parsers(file_path)
if self.parser_names:
found_valid_parser = True
logging.info("Valid parsers found for files in the zip archive.")
break

if not found_valid_parser:
logging.warning("There is no valid files in the zipped input file !")
self._log_no_parser_error(input_path)

def _handle_single_input(self, input_path: str):
"""
Handle a single input file: detect applicable parsers.
"""
self.parser_names = self.get_applicable_parsers(input_path)

if not self.parser_names:
logging.error("No applicable parsers found for input {}".format(input_path))
mimetype_set = list(set([v.expected_input_format() for v in ParserFactory.available_img_parsers.values()]))
logging.info("Supported mimetypes: {}".format(mimetype_set))
raise MappingAbortionError("Input file parsing aborted.")
logging.info("Applicable parsers: {}".format(", ".join(self.parser_names)))
self._log_no_parser_error(input_path)
else:
logging.info("Applicable parsers: {}".format(", ".join(self.parser_names)))

def filter_zipfile(self, tmpdir: str):
valid_file_paths = []
for file_path in Path(tmpdir).rglob('*'):
if not file_path.is_file():
# No directory path is allowed. Only process files
logging.debug(f"Skipping {file_path} as it is probably a directory.")
continue
if '__MACOSX' in str(file_path):
logging.debug(f"Skipping macOS metadata file: {file_path}")
continue
valid_file_paths.append(file_path)
return valid_file_paths

@staticmethod
def _log_no_parser_error(input_path: str):
"""
Log a detailed error and abort when no parser is found.
"""
logging.error("No applicable parsers found for input {}".format(input_path))
mimetype_set = list(set([v.expected_input_format() for v in ParserFactory.available_img_parsers.values()]))
logging.info("Supported mimetypes: {}".format(mimetype_set))
raise MappingAbortionError("Input file parsing aborted.")

@staticmethod
def get_applicable_parsers(input_path, by_extension = False):
Expand All @@ -50,6 +125,7 @@ def get_applicable_parsers(input_path, by_extension = False):
#Text files are tricky with magica, so try to read as such first
mt = get_filetype_with_magica(input_path)
logging.debug("Magika file identification result: {}".format(mt))
# This also tends to make TxtParser the fallback for any readable text file; needs improvement.
if mt not in applicable_types:
try:
robust_textfile_read(input_path)
@@ -82,3 +158,10 @@ def retrieve_image_info(self, input_path):
if result and result.image_metadata:
output_dict = result.image_metadata.to_schema_dict()
return output_dict

def clean_up(self):
if self.temp_dir_path:
shutil.rmtree(self.temp_dir_path)
logging.debug("Temp folder deletion: {} - {}".format(self.temp_dir_path, os.path.exists(self.temp_dir_path)))
else:
logging.debug("No temp folder used, nothing to clean up.")
45 changes: 45 additions & 0 deletions src/IO/sem/OutputWriter.py
@@ -0,0 +1,45 @@
import os
import json
import logging
import zipfile

from src.IO.MappingAbortionError import MappingAbortionError


class OutputWriter:

@staticmethod
def save_the_file(mapped_metadata, file_path):
try:
with open(file_path, 'w', encoding="utf-8") as json_file:
json.dump(mapped_metadata, json_file, indent=4, ensure_ascii=False)
logging.info("The output document has been created successfully!")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError, TypeError, ValueError) as e:
logging.error(f"Unable to save {file_path}: {e}")
raise MappingAbortionError(f"Failed to save {file_path}.")

@staticmethod
def save_to_zip(file_path_list, zip_file_path):
try:
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zf:
# "ZIP_DEFLATED" is a lossless compression algorithm, meaning no data is lost during the compression process.
for file_path in file_path_list:
try:
zf.write(file_path, os.path.basename(file_path))
logging.debug(f"Added {file_path} to zip.")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError, zipfile.BadZipFile) as e:
logging.error(f"Adding {file_path} to zip was not successful: {e}")
raise MappingAbortionError(f"Failed to add {file_path} to zip.")
logging.info(f"Files have been zipped into {zip_file_path} sucessfully!")
except MappingAbortionError as e:
logging.error(f"Failed to create zip file at {zip_file_path}: {e}")
raise MappingAbortionError(f"Failed to save to zip.")

# Delete the original files after zipping
for file_path in file_path_list:
try:
os.remove(file_path)
logging.info(f"{file_path} has been deleted.")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError) as e:
logging.warning(f"{file_path} to zip was not deleted: {e}")
raise MappingAbortionError(f"Failed to delete file {file_path} after zip.")
3 changes: 2 additions & 1 deletion src/parser/impl/TxtParser.py
@@ -3,6 +3,7 @@

from PIL import Image

from src.IO.MappingAbortionError import MappingAbortionError
from src.Preprocessor import Preprocessor
from src.model.ImageMD import ImageMD
from src.parser.ImageParser import ImageParser, ParserMode
@@ -36,7 +37,7 @@ def parse(self, file_path, mapping) -> tuple[ImageMD, str]:

if not mapping and not self.internal_mapping:
logging.error("No mapping provided for image parsing. Aborting")
exit(1)
raise MappingAbortionError("Image parsing failed.")
mapping_dict = mapping if mapping else self.internal_mapping
image_md = map_a_dict(input_md, mapping_dict)
#print("image_md: ", image_md)