6 changes: 4 additions & 2 deletions README.md
@@ -45,11 +45,13 @@ python -m mapping_cli

**1. SEM Mapping**

Use the `sem` subcommand for SEM mapping. The mapper expects a map file, an image or image metadata file, and a JSON output path:
Use the `sem` subcommand for SEM mapping. The mapper expects a map file, an input file (an image, an image metadata file, or a ZIP archive containing multiple such files), and an output path whose format depends on the input type: a JSON output path for a single input, or a ZIP output path for a ZIP input:
```
python -m mapping_cli sem -m <map_file> -i <image or metadata file> -o <json_output_path>
python -m mapping_cli sem -m <map_file> -i <image, metadata file, or zip_file> -o <json_or_zip_output_path>
```

Note that the ZIP output file will contain only the successfully mapped files, in JSON format.
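
For example, a hypothetical invocation with a ZIP archive (the file names here are illustrative, not files shipped with the repo):
```
python -m mapping_cli sem -m my_map.json -i my_images.zip -o mapped_results.zip
```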

For further information about the necessary map file, see [Mapping README](./src/resources/maps/mapping)

**2. Tomography Mapping**
81 changes: 69 additions & 12 deletions mapping_cli.py
@@ -3,11 +3,13 @@
import logging
import os
from sys import exit
from pathlib import Path

from src.IO.MappingAbortionError import MappingAbortionError
from src.IO.sem.InputReader import InputReader as InputReader_SEM
from src.IO.tomo.InputReader import InputReader as InputReader_TOMO
from src.IO.tomo.OutputWriter import OutputWriter
from src.IO.sem.OutputWriter import OutputWriter as OutputWriter_SEM
from src.IO.tomo.OutputWriter import OutputWriter as OutputWriter_TOMO
from src.resources.maps.parsing import map_from_flag

# make log level configurable from ENV, defaults to info level
@@ -87,8 +89,8 @@ def run_tomo_mapper(args):
#si = setup_infos if len(setup_infos) >= 1 else None
#ri = run_infos if len(run_infos) >= 1 else None

output = OutputWriter.stitch_together(setup_infos, run_infos, imgs)
OutputWriter.writeOutput(output, OUTPUT_PATH)
output = OutputWriter_TOMO.stitch_together(setup_infos, run_infos, imgs)
OutputWriter_TOMO.writeOutput(output, OUTPUT_PATH)
except MappingAbortionError as e:
reader.clean_up()
exit(e)
@@ -103,18 +105,73 @@ def run_sem_mapper(args):
MAP_SOURCE = argdict.get('map')
OUTPUT_PATH = argdict.get('output')

reader = None

try:
reader = InputReader_SEM(MAP_SOURCE, INPUT_SOURCE)

img_info = reader.retrieve_image_info(INPUT_SOURCE)
if not img_info:
logging.error('Could not retrieve image information due to unknown error. Aborting.')
exit(1)
with open(OUTPUT_PATH, 'w', encoding="utf-8") as f:
json.dump(img_info, f, indent=4, ensure_ascii=False)
reader = InputReader_SEM(MAP_SOURCE, INPUT_SOURCE, OUTPUT_PATH)
tmpdir = reader.temp_dir_path

if tmpdir:
# The case of a zipped input file
list_of_file_names = []
success_count = 0

for file_path in reader.filter_zipfile(tmpdir):
logging.info(f"Processing extracted file: {file_path.name}")
try:
file_name = file_path.with_suffix('').name + ".json"
reader_ = InputReader_SEM(MAP_SOURCE, file_path, file_name)
img_info = reader_.retrieve_image_info(file_path)
logging.debug(f"IMAGE_INFO: {img_info}")

if not img_info:
raise MappingAbortionError(f"Could not retrieve image information for {file_path.name}.")

OutputWriter_SEM.save_the_file(img_info, file_name)
list_of_file_names.append(file_name)
success_count += 1

except MappingAbortionError as e:
logging.warning(f"Skipping file {file_path.name} due to mapping error: {e}")
except Exception as e:
logging.exception(f"Unexpected error processing file {file_path.name}")

if success_count > 0:
logging.info(f"In total {success_count} file(s) were successfully processed.")
OutputWriter_SEM.save_to_zip(list_of_file_names, OUTPUT_PATH)
else:
raise MappingAbortionError("No files could be processed successfully. Aborting.")

else:
# The case of a single input file
logging.info("Processing input as single file.")
img_info = reader.retrieve_image_info(INPUT_SOURCE)
if not img_info:
raise MappingAbortionError("Could not retrieve image information. Aborting.")

OutputWriter_SEM.save_the_file(img_info, OUTPUT_PATH)

except MappingAbortionError as e:
#logging.error(f"MappingAbortionError: {e}")
exit(e)

finally:
Contributor: nice, I was not even aware that 'finally' would run even on an exit call. TIL :)
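
(Indeed: `sys.exit()` raises `SystemExit` rather than terminating the process outright, so enclosing `finally` blocks still execute. A minimal sketch:)

```python
import sys

try:
    sys.exit(1)  # raises SystemExit; does not kill the process immediately
finally:
    print("this still runs before the interpreter actually exits")
```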

if reader:
reader.clean_up()
Contributor: you need to implement the clean_up function. Since it is likely identical for tomo and sem, it should ideally be inherited from a common base class.

Contributor Author: For a first quick solution, a separate clean_up function has been added in IO/sem/InputReader. A common clean_up base class will be implemented to unify cleanup logic across reader types.

Contributor: I was thinking more in the direction of an InputReader base class that either provides the interface for the clean_up method or (more likely) even implements it, since it likely always treats a working_dir used by InputReaders in the same way. Maybe there is even more overlap, especially with regard to parser handling.

Contributor Author: The InputReader base class has been implemented in a distinct branch, dev_inputreader_base_class.
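
To illustrate the direction discussed above, a minimal sketch of such a shared base class (the name `InputReaderBase` and the exact layout are assumptions, not the actual code in `dev_inputreader_base_class`):

```python
import logging
import os
import shutil


class InputReaderBase:
    """Hypothetical common base for the sem/tomo InputReaders:
    owns the temporary working directory and implements clean_up once,
    so subclasses only add their format-specific parsing logic."""

    temp_dir_path: str = None

    def clean_up(self):
        # Remove the extracted working directory, if one was created.
        if self.temp_dir_path:
            shutil.rmtree(self.temp_dir_path)
            logging.debug("Temp folder deleted: %s (still exists: %s)",
                          self.temp_dir_path, os.path.exists(self.temp_dir_path))
        else:
            logging.debug("No temp folder used, nothing to clean up.")
```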


if __name__ == '__main__':
run_cli()
36 changes: 35 additions & 1 deletion mappingservice-plugin/integrationtests/basic.hurl
@@ -189,4 +189,38 @@ DELETE {{host}}/api/v1/mappingAdministration/{{id1}}
If-Match: {{etag}}
HTTP 204

#---------------- END Test Mapping for JEOL --------------------------------------
#---------------- BEGIN Test Mapping for ZIP FILE --------------------------------------

# Create mapping for the ZIP file test
POST {{host}}/api/v1/mappingAdministration/
Content-Type: multipart/form-data; boundary=boundary
[Options] # set all variables you need for further use in this hurl script
variable: id1=3
```
--boundary
Content-Disposition: form-data; name="record"; filename="blob"
Content-Type: application/zip

{"mappingId":"{{id1}}","mappingType": "{{mappingType}}","title":"Zeiss from CI test","description":"","acl":[]}
--boundary
Content-Disposition: form-data; name="document"; filename="blob"
Content-Type: application/json

{"Sem":"entry.instrument.instrumentName"}
--boundary--
```
HTTP 201

[Captures]
id: jsonpath "$.mappingId"

# Execute mapping with sample ZIP file
POST {{host}}/api/v1/mappingExecution/{{id1}}
[MultipartFormData]
document: file,./tests/sampleData/Archive.zip;
HTTP 200
[Asserts]
header "Content-Type" contains "application/zip"

#---------------- END Test Mapping for ZIP FILE --------------------------------------
@@ -70,12 +70,12 @@ public String uri() {

@Override
public MimeType[] inputTypes() {
return new MimeType[]{MimeTypeUtils.parseMimeType("image/tiff"), MimeTypeUtils.parseMimeType("text/plain")};
return new MimeType[]{MimeTypeUtils.parseMimeType("image/tiff"), MimeTypeUtils.parseMimeType("text/plain"), MimeTypeUtils.parseMimeType("application/zip")};
}

@Override
public MimeType[] outputTypes() {
return new MimeType[]{MimeTypeUtils.APPLICATION_JSON};
return new MimeType[]{MimeTypeUtils.APPLICATION_JSON, MimeTypeUtils.parseMimeType("application/zip")};
}

@Override
97 changes: 90 additions & 7 deletions src/IO/sem/InputReader.py
@@ -1,35 +1,110 @@
import logging
import mimetypes
import os
import shutil
from pathlib import Path

from src.IO.MappingAbortionError import MappingAbortionError
from src.parser.ImageParser import ParserMode
from src.parser.ParserFactory import ParserFactory
from src.util import load_json, get_filetype_with_magica, robust_textfile_read
from src.util import is_zipfile, extract_zip_file, load_json, get_filetype_with_magica, robust_textfile_read


class InputReader:

mapping = None
parser_names = None
temp_dir_path: str = None

def __init__(self, map_path, input_path):
def __init__(self, map_path, input_path, output_path):
logging.info("Preparing parsers based on parsing map file and input.")
self.mapping = load_json(map_path)

if not os.path.exists(input_path):
logging.error("Input file {} does not exist. Aborting".format(input_path))
raise MappingAbortionError("Input file loading failed.")

if is_zipfile(input_path):
self._handle_zip_input(input_path, output_path)
else:
self._handle_single_input(input_path)


def _handle_zip_input(self, input_path: str, output_path: str):
"""
Handle zipped input files: extract the archive and detect applicable parsers.
"""
# This checks whether the user specified the correct output_path extension, in particular
# preventing misuse of the .json extension, which is an easy mistake to make. The check
# could be improved for standalone use, but we keep it simple because the mapping service
# renames the IO files and handles the extensions correctly.
if output_path.lower().endswith('.json'):
logging.error(f"Expected '.zip' output path for zipped input '{input_path}', got '.json' instead.")
raise MappingAbortionError("Output path extension mismatch for zipped input.")

self.temp_dir_path = extract_zip_file(input_path)
found_valid_parser = False

for file_path in self.filter_zipfile(self.temp_dir_path):
self.parser_names = self.get_applicable_parsers(file_path)
if self.parser_names:
found_valid_parser = True
logging.info("Valid parsers found for files in the zip archive.")
break

if not found_valid_parser:
logging.warning("There is no valid files in the zipped input file !")
self._log_no_parser_error(input_path)

def _handle_single_input(self, input_path: str):
"""
Handle a single input file: detect applicable parsers.
"""
self.parser_names = self.get_applicable_parsers(input_path)

if not self.parser_names:
logging.error("No applicable parsers found for input {}".format(input_path))
mimetype_set = list(set([v.expected_input_format() for v in ParserFactory.available_img_parsers.values()]))
logging.info("Supported mimetypes: {}".format(mimetype_set))
raise MappingAbortionError("Input file parsing aborted.")
logging.info("Applicable parsers: {}".format(", ".join(self.parser_names)))
self._log_no_parser_error(input_path)
else:
logging.info("Applicable parsers: {}".format(", ".join(self.parser_names)))

def filter_zipfile(self, tmpdir: str):
valid_file_paths = []
for file_path in Path(tmpdir).rglob('*'):
if not file_path.is_file():
# No directory path is allowed. Only process files
logging.debug(f"Skipping {file_path} as it is probably a directory.")
continue
if '__MACOSX' in str(file_path):
logging.debug(f"Skipping macOS metadata file: {file_path}")
continue
valid_file_paths.append(file_path)
return valid_file_paths

@staticmethod
def _log_no_parser_error(input_path: str):
"""
Log a detailed error and abort when no parser is found.
"""
logging.error("No applicable parsers found for input {}".format(input_path))
mimetype_set = list(set([v.expected_input_format() for v in ParserFactory.available_img_parsers.values()]))
logging.info("Supported mimetypes: {}".format(mimetype_set))
raise MappingAbortionError("Input file parsing aborted.")

@staticmethod
def get_applicable_parsers(input_path, by_extension = False):
Expand All @@ -50,6 +125,7 @@ def get_applicable_parsers(input_path, by_extension = False):
#Text files are tricky with magica, so try to read as such first
mt = get_filetype_with_magica(input_path)
logging.debug("Magika file identification result: {}".format(mt))
# This also tends to make TxtParser the fallback for any readable text file; needs improvement.
if mt not in applicable_types:
try:
robust_textfile_read(input_path)
@@ -82,3 +158,10 @@ def retrieve_image_info(self, input_path):
if result and result.image_metadata:
output_dict = result.image_metadata.to_schema_dict()
return output_dict

def clean_up(self):
if self.temp_dir_path:
shutil.rmtree(self.temp_dir_path)
logging.debug("Temp folder deletion: {} - {}".format(self.temp_dir_path, os.path.exists(self.temp_dir_path)))
else:
logging.debug("No temp folder used, nothing to clean up.")
45 changes: 45 additions & 0 deletions src/IO/sem/OutputWriter.py
@@ -0,0 +1,45 @@
import os
import json
import logging
import zipfile

from src.IO.MappingAbortionError import MappingAbortionError


class OutputWriter:

@staticmethod
def save_the_file(mapped_metadata, file_path):
try:
with open(file_path, 'w', encoding="utf-8") as json_file:
json.dump(mapped_metadata, json_file, indent=4, ensure_ascii=False)
logging.info("The output document has been created successfully!")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError, TypeError, ValueError) as e:
logging.error(f"Unable to save {file_path}: {e}")
raise MappingAbortionError(f"Failed to save {file_path}.")

@staticmethod
def save_to_zip(file_path_list, zip_file_path):
try:
with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zf:
# "ZIP_DEFLATED" is a lossless compression algorithm, meaning no data is lost during the compression process.
for file_path in file_path_list:
try:
zf.write(file_path, os.path.basename(file_path))
logging.debug(f"Added {file_path} to zip.")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError, zipfile.BadZipFile) as e:
logging.error(f"Adding {file_path} to zip was not successful: {e}")
raise MappingAbortionError(f"Failed to add {file_path} to zip.")
logging.info(f"Files have been zipped into {zip_file_path} sucessfully!")
except MappingAbortionError as e:
logging.error(f"Failed to create zip file at {zip_file_path}: {e}")
raise MappingAbortionError(f"Failed to save to zip.")

# Delete the original files after zipping
for file_path in file_path_list:
try:
os.remove(file_path)
logging.info(f"{file_path} has been deleted.")
except (FileNotFoundError, PermissionError, IsADirectoryError, OSError) as e:
logging.warning(f"{file_path} to zip was not deleted: {e}")
raise MappingAbortionError(f"Failed to delete file {file_path} after zip.")
3 changes: 2 additions & 1 deletion src/parser/impl/TxtParser.py
@@ -3,6 +3,7 @@

from PIL import Image

from src.IO.MappingAbortionError import MappingAbortionError
from src.Preprocessor import Preprocessor
from src.model.ImageMD import ImageMD
from src.parser.ImageParser import ImageParser, ParserMode
@@ -36,7 +37,7 @@ def parse(self, file_path, mapping) -> tuple[ImageMD, str]:

if not mapping and not self.internal_mapping:
logging.error("No mapping provided for image parsing. Aborting")
exit(1)
raise MappingAbortionError("Image parsing failed.")
mapping_dict = mapping if mapping else self.internal_mapping
image_md = map_a_dict(input_md, mapping_dict)
#print("image_md: ", image_md)