wip: One file manifest UI update
JacobiClark committed Oct 30, 2024
1 parent 7b5aa21 commit 31a9a3b
Showing 1 changed file with 37 additions and 43 deletions.
80 changes: 37 additions & 43 deletions src/pyflask/curate/curate.py
@@ -3706,49 +3706,48 @@ def copytree(src, dst, symlinks=False, ignore=None):

def generate_manifest_file_data(dataset_structure_obj):
local_timezone = TZLOCAL()

double_extensions = [
".ome.tiff", ".ome.tif", ".ome.tf2,", ".ome.tf8", ".ome.btf", ".ome.xml",
".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz"
namespace_logger.info("Generating manifest file data for dataset structure:")
namespace_logger.info(dataset_structure_obj)

manifest_headers = [
'filename', 'timestamp', 'description', 'file type', 'entity', 'data modality',
'also in dataset', 'also in dataset path', 'data dictionary path',
'entity is transitive', 'Additional Metadata'
]
manifest_data = []

def get_name_extension(file_name):
double_extensions = [".ome.tiff", ".ome.tif", ".ome.tf2,", ".ome.tf8", ".ome.btf", ".ome.xml",
".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz",
".bcl.gz"]
for ext in double_extensions:
if file_name.endswith(ext):
# Extract the base extension before the double extension
base_ext = os.path.splitext(os.path.splitext(file_name)[0])[1]
return base_ext + ext
return os.path.splitext(file_name)[1]

def build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name):
file_manifest_template_data = []
filename_entry = "/".join(ds_struct_path) + "/" + file_name if ds_struct_path else file_name
file_type_entry = get_name_extension(file_name)

if filename_entry[:1] == "/":
file_manifest_template_data.append(filename_entry[1:])
else:
file_manifest_template_data.append(filename_entry)

file_manifest_template_data.append(timestamp_entry)
file_manifest_template_data.append(folder["files"][item]["description"])
file_manifest_template_data.append(file_type_entry)
file_manifest_template_data.append(folder["files"][item]["additional-metadata"])
# Basic columns for a manifest entry
file_manifest_template_data = [
"/".join(ds_struct_path) + "/" + file_name if ds_struct_path else file_name,
timestamp_entry,
folder["files"][item].get("description", ""),
get_name_extension(file_name),
folder["files"][item].get("additional-metadata", "")
]

# Add extra columns dynamically if they exist
if "extra_columns" in folder["files"][item]:
for key, value in folder["files"][item]["extra_columns"].items():
if key not in manifest_headers:
manifest_headers.append(key)
file_manifest_template_data.append(value)
if key not in hlf_data_array[0]:
hlf_data_array[0].append(key)

return file_manifest_template_data

def recursive_folder_traversal(folder, hlf_data_array, ds_struct_path, is_pennsieve):
def recursive_folder_traversal(folder, ds_struct_path, is_pennsieve):
# Traverse files in the folder
if "files" in folder:
standard_manifest_columns = ["filename", "timestamp", "description", "file type", "entity", "data modality", "also in dataset", "data dictionary path", "entity is transitive", "Additional Metadata"]
if not hlf_data_array:
hlf_data_array.append(standard_manifest_columns)

for item in folder["files"]:
if item in ["manifest.xlsx", "manifest.csv"]:
continue
@@ -3762,33 +3761,28 @@ def recursive_folder_traversal(folder, hlf_data_array, ds_struct_path, is_pennsieve):
mtime = pathlib.Path(local_path_to_file).stat().st_mtime
timestamp_entry = datetime.fromtimestamp(mtime, tz=local_timezone).isoformat().replace(".", ",").replace("+00:00", "Z")

hlf_data_array.append(build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name))
manifest_data.append(build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name))

# Recursively traverse subfolders
if "folders" in folder:
for item in folder["folders"]:
ds_struct_path.append(item)
recursive_folder_traversal(folder["folders"][item], hlf_data_array, ds_struct_path, is_pennsieve)
for subfolder_name, subfolder_content in folder["folders"].items():
ds_struct_path.append(subfolder_name)
recursive_folder_traversal(subfolder_content, ds_struct_path, is_pennsieve)
ds_struct_path.pop()

hlf_manifest_data = {}

namespace_logger.info("Generating manifest file data")
namespace_logger.info(dataset_structure_obj)

for high_level_folder in dataset_structure_obj["folders"]:
hlf_data_array = []
relative_structure_path = []

is_pennsieve = "bfpath" in dataset_structure_obj["folders"][high_level_folder]
recursive_folder_traversal(dataset_structure_obj["folders"][high_level_folder], hlf_data_array, relative_structure_path, is_pennsieve)
# Begin recursive traversal from the top-level folders
for high_level_folder, folder_content in dataset_structure_obj["folders"].items():
is_pennsieve = "bfpath" in folder_content
recursive_folder_traversal(folder_content, [high_level_folder], is_pennsieve)

hlf_manifest_data[high_level_folder] = hlf_data_array
namespace_logger.info("Generated manifest data:")
namespace_logger.info(manifest_data)

return hlf_manifest_data
return manifest_headers, manifest_data


def handle_duplicate_package_name_error(e, soda_json_structure):
if "if-existing-files" in soda_json_structure["generate-dataset"] and (soda_json_structure["generate-dataset"]["if-existing-files"] == "create-duplicate") and (e.response.text== '{"type":"BadRequest","message":"package name must be unique","code":400}'):
return

raise e
raise e
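
Usage note (not part of this commit): after this change, generate_manifest_file_data returns a flat (manifest_headers, manifest_data) pair instead of the old per-high-level-folder hlf_manifest_data dictionary, and extra_columns keys are appended to manifest_headers as files are encountered. A minimal caller sketch, assuming the function is importable from curate.py and that a single manifest.csv is the desired output (both assumptions, not something this diff shows), could look like this:

# Hypothetical consumer of the new return shape; the function name
# write_single_manifest and the manifest.csv output path are assumptions.
import csv

def write_single_manifest(dataset_structure_obj, output_path="manifest.csv"):
    headers, rows = generate_manifest_file_data(dataset_structure_obj)
    with open(output_path, "w", newline="") as manifest_file:
        writer = csv.writer(manifest_file)
        writer.writerow(headers)
        for row in rows:
            # Rows built before a given extra_columns header was appended are
            # shorter than the final header row, so pad them to keep columns aligned.
            writer.writerow(row + [""] * (len(headers) - len(row)))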
