Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add merge_downloaded_parts script #4

Merged
merged 1 commit into from
Dec 15, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions mdai_utils/merge_downloaded_parts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
import shutil
from pathlib import Path


def merge_downloaded_parts(part_folder, remove_zip_part_file=False):
"""
md.ai splits the data into multiple parts (~50GB each). This function
merges all the parts into one directory. A simple mv does not work because
there are duplicated folders (with different content) in the parts.

part_folder is the path to the folder containing the parts of the data.
For example:
mdai_uab_project_7YNdkRbz_images_dataset_D_vVO1L2_2023-12-14-121547_part21of21

The output folder will be parallel to the part folder, without the _partXXofYY.
"""

# Create a new directory with the dir string
part_folder_path = Path(part_folder)
out_folder_path = part_folder_path.parent
dir_string = part_folder_path.name
# Remove the part information from the dir string
re_pattern = re.compile("_part[0-9]+of[0-9]+")
dir_string = re.sub(re_pattern, "", dir_string)

# Create a new directory with the dir string
out_folder_path = part_folder_path.parent / dir_string
out_folder_path.mkdir(exist_ok=True, parents=True)

# Collect all parts folder, don't capture zip files
all_parts = out_folder_path.parent.glob(dir_string + "_part*")
all_parts = [part for part in all_parts if not part.name.endswith(".zip")]
all_parts = sorted(all_parts)
if len(all_parts) == 0:
raise ValueError(f"No part folders matching {out_folder_path}")

for part_folder in all_parts:
# Move all files from the part folder to the out folder.
print(f"Moving files from {part_folder}")
for study in part_folder.glob("*"):
for series in study.glob("*"):
destination = out_folder_path / study.name / series.name
shutil.move(str(series), str(destination))
assert (
len(list(study.glob("*"))) == 0
), f"Study folder is not empty: {study}. Aborting, do it manually for this part folder."
shutil.rmtree(str(study))
assert (
len(list(part_folder.glob("*"))) == 0
), f"Part folder is not empty: {part_folder}. Aborting, do it manually for this part folder."
shutil.rmtree(str(part_folder))
if remove_zip_part_file:
zip_file = part_folder.with_suffix(".zip")
zip_file.unlink()


if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
"--part_folder", type=str, help="The path to one (any) part folder"
)
parser.add_argument(
"--remove_zip_files",
action="store_true",
help="Remove the zip file containing the part. Default: False",
)
args = parser.parse_args()
print(args)

merge_downloaded_parts(args.part_folder, args.remove_zip_files)