From 31a9a3bdf73e7de44490aee2d89d230de2a8dd08 Mon Sep 17 00:00:00 2001 From: jacobiclark Date: Wed, 30 Oct 2024 10:52:05 -0700 Subject: [PATCH] wip: One file manifest UI update --- src/pyflask/curate/curate.py | 80 +++++++++++++++++------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/src/pyflask/curate/curate.py b/src/pyflask/curate/curate.py index 5477c8909..4cf69ba46 100644 --- a/src/pyflask/curate/curate.py +++ b/src/pyflask/curate/curate.py @@ -3706,49 +3706,48 @@ def copytree(src, dst, symlinks=False, ignore=None): def generate_manifest_file_data(dataset_structure_obj): local_timezone = TZLOCAL() - - double_extensions = [ - ".ome.tiff", ".ome.tif", ".ome.tf2,", ".ome.tf8", ".ome.btf", ".ome.xml", - ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz" + namespace_logger.info("Generating manifest file data for dataset structure:") + namespace_logger.info(dataset_structure_obj) + + manifest_headers = [ + 'filename', 'timestamp', 'description', 'file type', 'entity', 'data modality', + 'also in dataset', 'also in dataset path', 'data dictionary path', + 'entity is transitive', 'Additional Metadata' ] + manifest_data = [] def get_name_extension(file_name): + double_extensions = [".ome.tiff", ".ome.tif", ".ome.tf2,", ".ome.tf8", ".ome.btf", ".ome.xml", + ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", + ".bcl.gz"] for ext in double_extensions: if file_name.endswith(ext): - # Extract the base extension before the double extension base_ext = os.path.splitext(os.path.splitext(file_name)[0])[1] return base_ext + ext return os.path.splitext(file_name)[1] def build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name): - file_manifest_template_data = [] - filename_entry = "/".join(ds_struct_path) + "/" + file_name if ds_struct_path else file_name - file_type_entry = get_name_extension(file_name) - - if filename_entry[:1] == "/": - file_manifest_template_data.append(filename_entry[1:]) - else: - file_manifest_template_data.append(filename_entry) - - file_manifest_template_data.append(timestamp_entry) - file_manifest_template_data.append(folder["files"][item]["description"]) - file_manifest_template_data.append(file_type_entry) - file_manifest_template_data.append(folder["files"][item]["additional-metadata"]) + # Basic columns for a manifest entry + file_manifest_template_data = [ + "/".join(ds_struct_path) + "/" + file_name if ds_struct_path else file_name, + timestamp_entry, + folder["files"][item].get("description", ""), + get_name_extension(file_name), + folder["files"][item].get("additional-metadata", "") + ] + # Add extra columns dynamically if they exist if "extra_columns" in folder["files"][item]: for key, value in folder["files"][item]["extra_columns"].items(): + if key not in manifest_headers: + manifest_headers.append(key) file_manifest_template_data.append(value) - if key not in hlf_data_array[0]: - hlf_data_array[0].append(key) return file_manifest_template_data - def recursive_folder_traversal(folder, hlf_data_array, ds_struct_path, is_pennsieve): + def recursive_folder_traversal(folder, ds_struct_path, is_pennsieve): + # Traverse files in the folder if "files" in folder: - standard_manifest_columns = ["filename", "timestamp", "description", "file type", "entity", "data modality", "also in dataset", "data dictionary path", "entity is transitive", "Additional Metadata"] - if not hlf_data_array: - hlf_data_array.append(standard_manifest_columns) - for item in folder["files"]: if item in ["manifest.xlsx", "manifest.csv"]: continue @@ -3762,33 +3761,28 @@ def recursive_folder_traversal(folder, hlf_data_array, ds_struct_path, is_pennsi mtime = pathlib.Path(local_path_to_file).stat().st_mtime timestamp_entry = datetime.fromtimestamp(mtime, tz=local_timezone).isoformat().replace(".", ",").replace("+00:00", "Z") - hlf_data_array.append(build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name)) + manifest_data.append(build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name)) + # Recursively traverse subfolders if "folders" in folder: - for item in folder["folders"]: - ds_struct_path.append(item) - recursive_folder_traversal(folder["folders"][item], hlf_data_array, ds_struct_path, is_pennsieve) + for subfolder_name, subfolder_content in folder["folders"].items(): + ds_struct_path.append(subfolder_name) + recursive_folder_traversal(subfolder_content, ds_struct_path, is_pennsieve) ds_struct_path.pop() - hlf_manifest_data = {} - - namespace_logger.info("Generating manifest file data") - namespace_logger.info(dataset_structure_obj) - - for high_level_folder in dataset_structure_obj["folders"]: - hlf_data_array = [] - relative_structure_path = [] - - is_pennsieve = "bfpath" in dataset_structure_obj["folders"][high_level_folder] - recursive_folder_traversal(dataset_structure_obj["folders"][high_level_folder], hlf_data_array, relative_structure_path, is_pennsieve) + # Begin recursive traversal from the top-level folders + for high_level_folder, folder_content in dataset_structure_obj["folders"].items(): + is_pennsieve = "bfpath" in folder_content + recursive_folder_traversal(folder_content, [high_level_folder], is_pennsieve) - hlf_manifest_data[high_level_folder] = hlf_data_array + namespace_logger.info("Generated manifest data:") + namespace_logger.info(manifest_data) - return hlf_manifest_data + return manifest_headers, manifest_data def handle_duplicate_package_name_error(e, soda_json_structure): if "if-existing-files" in soda_json_structure["generate-dataset"] and (soda_json_structure["generate-dataset"]["if-existing-files"] == "create-duplicate") and (e.response.text== '{"type":"BadRequest","message":"package name must be unique","code":400}'): return - raise e + raise e \ No newline at end of file