@@ -3705,184 +3705,83 @@ def copytree(src, dst, symlinks=False, ignore=None):
def generate_manifest_file_data(dataset_structure_obj):
-    # modify this function here to handle paths from pennsieve
-    # create path using bfpath key from json object
-
    local_timezone = TZLOCAL()
-
+
    double_extensions = [
-        ".ome.tiff",
-        ".ome.tif",
-        ".ome.tf2,",
-        ".ome.tf8",
-        ".ome.btf",
-        ".ome.xml",
-        ".brukertiff.gz",
-        ".mefd.gz",
-        ".moberg.gz",
-        ".nii.gz",
-        ".mgh.gz",
-        ".tar.gz",
-        ".bcl.gz",
+        ".ome.tiff", ".ome.tif", ".ome.tf2,", ".ome.tf8", ".ome.btf", ".ome.xml",
+        ".brukertiff.gz", ".mefd.gz", ".moberg.gz", ".nii.gz", ".mgh.gz", ".tar.gz", ".bcl.gz"
    ]

    def get_name_extension(file_name):
-        double_ext = False
        for ext in double_extensions:
-            if file_name.find(ext) != -1:
-                double_ext = True
-                break
-
-        ext = ""
-
-        if double_ext == False:
-            ext = os.path.splitext(file_name)[1]
+            if file_name.endswith(ext):
+                # Extract the base extension before the double extension
+                base_ext = os.path.splitext(os.path.splitext(file_name)[0])[1]
+                return base_ext + ext
+        return os.path.splitext(file_name)[1]
+
+    def build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name):
+        file_manifest_template_data = []
+        filename_entry = "/".join(ds_struct_path) + "/" + file_name if ds_struct_path else file_name
+        file_type_entry = get_name_extension(file_name)
+
+        if filename_entry[:1] == "/":
+            file_manifest_template_data.append(filename_entry[1:])
        else:
-            ext = (
-                os.path.splitext(os.path.splitext(file_name)[0])[1]
-                + os.path.splitext(file_name)[1]
-            )
-        return ext
-
-    def guided_recursive_folder_traversal(folder, hlf_data_array, ds_struct_path):
-        if "files" in folder.keys():
-            standard_manifest_columns = ["filename", "timestamp", "description", "file type", "Additional Metadata"]
-            if (len(hlf_data_array) < 1):
-                hlf_data_array.append(standard_manifest_columns)
-            for item in list(folder["files"]):
-                # do not generate a manifest file entry for the manifest file itself
-                if item in ["manifest.xlsx", "manifest.csv"]:
-                    continue
-                file_manifest_template_data = []
-                local_path_to_file = folder["files"][item]["path"].replace("\\", "/")
-                item_description = folder["files"][item]["description"]
-                item_additional_info = folder["files"][item]["additional-metadata"]
-
-                # The name of the file eg "file.txt"
-                file_name = os.path.basename(local_path_to_file)
-                if file_name != item:
-                    file_name = item
-                if len(ds_struct_path) > 0:
-                    filename_entry = "/".join(ds_struct_path) + "/" + file_name
-                else:
-                    filename_entry = file_name
+            file_manifest_template_data.append(filename_entry)

-                # The extension of the file eg ".txt"
-                file_type_entry = get_name_extension(file_name)
+        file_manifest_template_data.append(timestamp_entry)
+        file_manifest_template_data.append(folder["files"][item]["description"])
+        file_manifest_template_data.append(file_type_entry)
+        file_manifest_template_data.append(folder["files"][item]["additional-metadata"])

-                # The timestamp of the file on the user's local machine
-                file_path = pathlib.Path(local_path_to_file)
-                mtime = file_path.stat().st_mtime
-                last_mod_time = datetime.fromtimestamp(mtime, tz=local_timezone).fromtimestamp(mtime).astimezone(
-                    local_timezone
-                )
-                timestamp_entry = last_mod_time.isoformat().replace(".", ",").replace("+00:00", "Z")
+        if "extra_columns" in folder["files"][item]:
+            for key, value in folder["files"][item]["extra_columns"].items():
+                file_manifest_template_data.append(value)
+                if key not in hlf_data_array[0]:
+                    hlf_data_array[0].append(key)

-                if filename_entry[:1] == "/":
-                    file_manifest_template_data.append(filename_entry[:1])
-                else:
-                    file_manifest_template_data.append(filename_entry)
-
-                file_manifest_template_data.append(timestamp_entry)
-                file_manifest_template_data.append(item_description)
-                file_manifest_template_data.append(file_type_entry)
-                file_manifest_template_data.append(item_additional_info)
-
-                # extra column key is an object of all extra columns of a manifest
-                # key will be the column header and value will be the value of the column+row
-                # (from the excel) (now in the form of a dict)
-                if "extra_columns" in folder["files"][item]:
-                    for key in folder["files"][item]["extra_columns"]:
-                        file_manifest_template_data.append(folder["files"][item]["extra_columns"][key])
-                        if key not in hlf_data_array[0]:
-                            # add column name to manifest column names array
-                            hlf_data_array[0].append(key)
-
-                hlf_data_array.append(file_manifest_template_data)
-
-        if "folders" in folder.keys():
-            for item in list(folder["folders"]):
-                relative_structure_path.append(item)
-                guided_recursive_folder_traversal(
-                    folder["folders"][item], hlf_data_array, relative_structure_path
-                )
-                relative_structure_path.pop()
-        return
+        return file_manifest_template_data

-    def pennsieve_recursive_folder_traversal(folder, hlf_data_array, ds_struct_path):
-        if "files" in folder.keys():
-            standard_manifest_columns = ["filename", "timestamp", "description", "file type", "Additional Metadata"]
-            if (len(hlf_data_array) < 1):
+    def recursive_folder_traversal(folder, hlf_data_array, ds_struct_path, is_pennsieve):
+        if "files" in folder:
+            standard_manifest_columns = ["filename", "timestamp", "description", "file type", "entity", "data modality", "also in dataset", "data dictionary path", "entity is transitive", "Additional Metadata"]
+            if not hlf_data_array:
                hlf_data_array.append(standard_manifest_columns)
-            for item in list(folder["files"]):
-                file_manifest_template_data = []
+
+            for item in folder["files"]:
                if item in ["manifest.xlsx", "manifest.csv"]:
                    continue
-                item_description = folder["files"][item]["description"]
-                item_additional_info = folder["files"][item]["additional-metadata"]
-                file_name = ""
-                if folder["files"][item]["type"] == "bf":
+
+                if is_pennsieve and folder["files"][item]["type"] == "bf":
                    file_name = os.path.basename(item)
                    timestamp_entry = folder["files"][item]["timestamp"]
                else:
                    local_path_to_file = folder["files"][item]["path"].replace("\\", "/")
                    file_name = os.path.basename(local_path_to_file)
-                    file_path = pathlib.Path(local_path_to_file)
-                    mtime = file_path.stat().st_mtime
-                    last_mod_time = datetime.fromtimestamp(mtime, tz=local_timezone).fromtimestamp(mtime).astimezone(local_timezone)
-                    timestamp_entry = last_mod_time.isoformat().replace(".", ",").replace("+00:00", "Z")
-
+                    mtime = pathlib.Path(local_path_to_file).stat().st_mtime
+                    timestamp_entry = datetime.fromtimestamp(mtime, tz=local_timezone).isoformat().replace(".", ",").replace("+00:00", "Z")

-                filename_entry = "/".join(ds_struct_path) + "/" + file_name
-                file_type_entry = get_name_extension(file_name)
+                hlf_data_array.append(build_file_entry(item, folder, ds_struct_path, timestamp_entry, file_name))

-                if filename_entry[:1] == "/":
-                    file_manifest_template_data.append(filename_entry[1:])
-                else:
-                    file_manifest_template_data.append(filename_entry)
-                file_manifest_template_data.append(timestamp_entry)
-                file_manifest_template_data.append(item_description)
-                file_manifest_template_data.append(file_type_entry)
-                file_manifest_template_data.append(item_additional_info)
-                # extra column key is an object of all extra columns of a manifest
-                # key will be the column header and value will be the value of the column+row
-                # (from the excel) (now in the form of a dict)
-                if "extra_columns" in folder["files"][item]:
-                    for key in folder["files"][item]["extra_columns"]:
-                        file_manifest_template_data.append(folder["files"][item]["extra_columns"][key])
-                        if key not in hlf_data_array[0]:
-                            # add column name to manifest column names array
-                            hlf_data_array[0].append(key)
-
-                hlf_data_array.append(file_manifest_template_data)
+        if "folders" in folder:
+            for item in folder["folders"]:
+                ds_struct_path.append(item)
+                recursive_folder_traversal(folder["folders"][item], hlf_data_array, ds_struct_path, is_pennsieve)
+                ds_struct_path.pop()

-        if "folders" in folder.keys():
-            for item in list(folder["folders"]):
-                relative_structure_path.append(item)
-                pennsieve_recursive_folder_traversal(
-                    folder["folders"][item], hlf_data_array, relative_structure_path
-                )
-                relative_structure_path.pop()
-        return
-
-    # Initialize the array that the manifest data will be added to.
    hlf_manifest_data = {}
-    # any additional columns created by the user will be appended with the high level folder when found

+    namespace_logger.info("Generating manifest file data")
+    namespace_logger.info(dataset_structure_obj)

-    # Loop through each high level folder and create a manifest data array for each.
-    for high_level_folder in list(dataset_structure_obj["folders"]):
+    for high_level_folder in dataset_structure_obj["folders"]:
        hlf_data_array = []
-
-        # create an array to keep track of the path to the obj being recursed over
        relative_structure_path = []
-        # hlf_data_array.append(standard_manifest_columns)

-        if "bfpath" in dataset_structure_obj["folders"][high_level_folder]:
-            # means the json is from a pennsieve dataset
-            pennsieve_recursive_folder_traversal(dataset_structure_obj["folders"][high_level_folder], hlf_data_array, relative_structure_path)
-        else:
-            guided_recursive_folder_traversal(dataset_structure_obj["folders"][high_level_folder], hlf_data_array, relative_structure_path)
+        is_pennsieve = "bfpath" in dataset_structure_obj["folders"][high_level_folder]
+        recursive_folder_traversal(dataset_structure_obj["folders"][high_level_folder], hlf_data_array, relative_structure_path, is_pennsieve)
+
        hlf_manifest_data[high_level_folder] = hlf_data_array

    return hlf_manifest_data
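For context, a minimal sketch of how the refactored generate_manifest_file_data might be exercised. The shape of dataset_structure_obj (the nested "folders"/"files" keys, "bfpath", "type", "timestamp", "description", "additional-metadata") is taken from the code above; the import path and the value given to "bfpath" are assumptions for illustration only.

    # Hypothetical usage sketch; the import path is an assumption, not the project's actual layout.
    from manifest_utils import generate_manifest_file_data

    # A Pennsieve-style structure: the high-level folder carries a "bfpath" key,
    # so is_pennsieve resolves to True and each "bf" file supplies its own
    # timestamp instead of being stat'ed on disk. Only the presence of "bfpath"
    # matters to the function; its value here is a placeholder.
    dataset_structure_obj = {
        "folders": {
            "primary": {
                "bfpath": ["primary"],
                "files": {
                    "readings.csv": {
                        "type": "bf",
                        "timestamp": "2023-01-05T12:00:00,000000Z",
                        "description": "Raw sensor readings",
                        "additional-metadata": "",
                    },
                },
                "folders": {},
            }
        }
    }

    manifest = generate_manifest_file_data(dataset_structure_obj)
    # manifest["primary"][0] -> the standard column headers (plus any extra columns found)
    # manifest["primary"][1] -> roughly ["readings.csv", "2023-01-05T12:00:00,000000Z",
    #                                    "Raw sensor readings", ".csv", ""]

For local (non-Pennsieve) folders there is no "bfpath" key, so the function stats each file's "path" on disk and formats the modification time itself; in that case the paths in the structure must point at real files.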