diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4ddfcf3c..c0eb17e4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,5 +1,10 @@
 name: CI
-on: pull_request
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
 
 jobs:
   ci:
@@ -36,5 +41,7 @@ jobs:
       - name: Install the project dependencies
        run: poetry install --with dev --extras all
      - name: Run the automated tests
+       env:
+         DANDI_API_KEY: ${{ secrets.DANDI_API_KEY }}
        run: poetry run pytest -v
        working-directory: ./tests
diff --git a/linc_convert/modalities/lsm/__init__.py b/linc_convert/modalities/lsm/__init__.py
index c605f13d..4e823e3f 100644
--- a/linc_convert/modalities/lsm/__init__.py
+++ b/linc_convert/modalities/lsm/__init__.py
@@ -3,7 +3,7 @@
 try:
     import tifffile as _  # noqa: F401
 
-    __all__ = ["cli", "mosaic", "multi_slice", "spool"]
-    from . import cli, mosaic, multi_slice, spool
+    __all__ = ["cli", "mosaic", "multi_slice", "spool", "transfer"]
+    from . import cli, mosaic, multi_slice, spool, transfer
 except ImportError:
     pass
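Reviewer note (not part of the patch): because the new import sits inside the existing try/except ImportError guard, lsm.transfer is only exposed when the optional LSM dependencies (tifffile, and now dandi) are importable. A minimal sketch of guarding downstream use, assuming only the package layout shown above:

    # Hypothetical availability check: `transfer` is absent from the lsm
    # namespace when the optional dependencies failed to import.
    from linc_convert.modalities import lsm

    if not hasattr(lsm, "transfer"):
        raise RuntimeError("Install the 'lsm' extras to enable DANDI transfer.")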
diff --git a/linc_convert/modalities/lsm/transfer.py b/linc_convert/modalities/lsm/transfer.py
new file mode 100644
index 00000000..f0233ed0
--- /dev/null
+++ b/linc_convert/modalities/lsm/transfer.py
@@ -0,0 +1,133 @@
+"""
+Convert a collection of spool .dat files generated by light sheet
+microscopy to batched tar archives and upload them with the DANDI client.
+"""
+
+# stdlib
+import os
+import tarfile
+from datetime import datetime
+from pathlib import Path
+
+# externals
+import cyclopts
+import dandi.download
+import dandi.upload
+
+# internals
+from linc_convert.modalities.lsm.cli import lsm
+
+transfer = cyclopts.App(name="transfer", help_format="markdown")
+lsm.command(transfer)
+
+
+@transfer.default
+def dandi_transfer(input_dir, dandiset_url, subject, output_dir='.',
+                   max_size_gb=2.0, upload=False):
+    """
+    Upload .dat files to DANDI in batched tar archives.
+
+    Parameters
+    ----------
+    input_dir : str
+        Directory containing the .dat files to archive
+    dandiset_url : str
+        URL of the dandiset to upload to
+        (e.g., https://lincbrain.org/dandiset/000010)
+    subject : str
+        Subject label used to name the archives
+    output_dir : str, optional
+        Directory in which to download the dandiset (default: '.')
+    max_size_gb : float, optional
+        Maximum size of each archive in GB (default: 2.0)
+    upload : bool, optional
+        Upload the archives to DANDI (default: False)
+    """
+    max_size_bytes = int(max_size_gb * 1024 * 1024 * 1024)
+
+    dandi.download.download(dandiset_url, output_dir=output_dir)
+
+    dandiset_id = dandiset_url.split('/')[-1]
+    dandiset_directory = f'{output_dir}/{dandiset_id}'
+
+    if not os.path.exists(f'{dandiset_directory}/dataset_description.json'):
+        with open(f'{dandiset_directory}/dataset_description.json', 'w') as f:
+            f.write('{}')
+
+    archive_directory = f'{dandiset_directory}/sourcedata/sub-{subject}'
+    os.makedirs(archive_directory, exist_ok=True)
+
+    # Sort for deterministic batch contents across runs.
+    dat_files = sorted(Path(input_dir).glob("*.dat"))
+    num_dat_files = len(dat_files)
+    if num_dat_files:
+        print(f"Found {num_dat_files} .dat files in '{input_dir}'.")
+    else:
+        print(f"No .dat files found in '{input_dir}'.")
+        return
+
+    batch = 0
+    file_number = 0
+
+    while file_number < num_dat_files:
+
+        print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] "
+              f"Creating archive batch {batch}")
+
+        archive_path = os.path.join(archive_directory,
+                                    f"sub-{subject}_desc-batch{batch}.tar")
+        archive = tarfile.open(archive_path, "w")
+
+        batch_size = 0
+        batch_files = 0
+
+        # Fill the archive until it reaches the size limit or files run out.
+        while batch_size < max_size_bytes and file_number < num_dat_files:
+
+            file_path = dat_files[file_number]
+            file_size = os.path.getsize(file_path)
+
+            print(f"Adding '{file_path.name}' "
+                  f"({file_size / 1024**2:.2f} MB) to archive.")
+            archive.add(file_path, arcname=file_path.name)
+
+            batch_size += file_size
+            batch_files += 1
+            file_number += 1
+
+        archive.close()
+
+        print(f"Archive created with {batch_files} files "
+              f"({batch_size / 1024**2:.2f} MB).")
+
+        if upload:
+            print(f"Uploading {archive_path}.")
+
+            if 'lincbrain.org' in dandiset_url:
+                dandi_instance = 'linc'
+            elif 'dandiarchive.org' in dandiset_url:
+                dandi_instance = 'dandi'
+            else:
+                raise ValueError(f"Unknown DANDI instance: {dandiset_url}")
+
+            # Retry indefinitely; transient network errors are common on
+            # multi-gigabyte uploads.
+            success = False
+            while not success:
+                try:
+                    dandi.upload.upload([dandiset_directory],
+                                        dandi_instance=dandi_instance)
+                    success = True
+                    print("Upload successful.")
+                except Exception as e:
+                    print(f"Upload failed with error: {e}")
+
+            # Remove the local archive once uploaded to reclaim disk space.
+            os.remove(archive_path)
+
+        batch += 1
+
+        print(f"Progress: {file_number}/{num_dat_files} files processed "
+              f"({file_number / num_dat_files * 100:.2f}%).")
+
+    print(f"{file_number} files processed successfully.")
diff --git a/pyproject.toml b/pyproject.toml
index dd0ac250..6884e987 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,7 @@ zarr = "^2.0.0"
 nifti-zarr = "*"
 # optionals
 glymur = { version = "*", optional = true }
+dandi = { version = ">=0.68.1", optional = true }
 tifffile = { version = "*", optional = true }
 h5py = { version = "*", optional = true }
 scipy = { version = "*", optional = true }
@@ -38,10 +39,10 @@ wkw = { version = "*", optional = true }
 
 [tool.poetry.extras]
 df = ["glymur"]
-lsm = ["tifffile"]
+lsm = ["dandi", "tifffile"]
 psoct = ["h5py", "scipy"]
 wk = ["wkw"]
-all = ["glymur", "tifffile", "h5py", "scipy", "wkw"]
+all = ["glymur", "dandi", "tifffile", "h5py", "scipy", "wkw"]
 
 [tool.poetry.group.dev]
 optional = true
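Reviewer note (not part of the patch): with the extras wired up above, `poetry install --extras lsm` (or `--extras all`) now pulls in dandi alongside tifffile. A hedged sketch of calling the new command from Python; the input directory, dandiset URL, and subject label are placeholders:

    from linc_convert.modalities.lsm import transfer

    transfer.dandi_transfer(
        input_dir="/path/to/spool/dats",  # placeholder input directory
        dandiset_url="https://lincbrain.org/dandiset/000010",  # placeholder
        subject="example01",  # placeholder subject label
        output_dir=".",
        max_size_gb=2.0,
        upload=False,  # dry run: build the tar batches without uploading
    )

Even with upload=False the function still calls dandi.download.download to materialize the dandiset layout, so network access is required.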
diff --git a/tests/test_lsm.py b/tests/test_lsm.py
index 76a54c42..0d1a81db 100644
--- a/tests/test_lsm.py
+++ b/tests/test_lsm.py
@@ -1,10 +1,13 @@
 from pathlib import Path
+import filecmp
+import os
+import tarfile
 
 import numpy as np
 import tifffile
 from helper import _cmp_zarr_archives
 
-from linc_convert.modalities.lsm import mosaic
+from linc_convert.modalities.lsm import mosaic, transfer
 
 
 def _write_test_data(directory: str) -> None:
@@ -26,3 +29,40 @@
     output_zarr = tmp_path / "output.zarr"
     mosaic.convert(str(tmp_path), out=str(output_zarr))
     assert _cmp_zarr_archives(str(output_zarr), "data/lsm.zarr.zip")
+
+
+def test_transfer():
+    input_dir = './000051/sourcedata/sub-test1'
+
+    # The DANDI instance is derived from the dandiset URL inside
+    # dandi_transfer, so no dandi_instance argument is passed here.
+    transfer.dandi_transfer(input_dir=input_dir,
+                            dandiset_url='https://lincbrain.org/dandiset/000051',
+                            subject='test1',
+                            output_dir='.',
+                            max_size_gb=0.02,
+                            upload=False)
+
+    extract_dir = './sub-test1'
+    os.mkdir(extract_dir)
+
+    # Unpack every archive, then drop it so only extracted files remain.
+    tar_files = list(Path(input_dir).glob("*.tar"))
+    for tar_file in tar_files:
+        with tarfile.open(tar_file, "r") as tar:
+            tar.extractall(path=extract_dir)
+        os.remove(tar_file)
+
+    dirs_cmp = filecmp.dircmp(input_dir, extract_dir)
+    assert not dirs_cmp.left_only and not dirs_cmp.right_only, "Files do not match"
+
+    # Join file names to their directory before stat'ing them; bare names
+    # from os.listdir would otherwise resolve against the working directory.
+    input_dir_size = sum(os.path.getsize(os.path.join(input_dir, f))
+                         for f in os.listdir(input_dir)
+                         if os.path.isfile(os.path.join(input_dir, f)))
+    extract_dir_size = sum(os.path.getsize(os.path.join(extract_dir, f))
+                           for f in os.listdir(extract_dir)
+                           if os.path.isfile(os.path.join(extract_dir, f)))
+
+    assert input_dir_size == extract_dir_size, "File sizes do not match"
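Reviewer note (not part of the patch): test_transfer assumes a pre-populated ./000051/sourcedata/sub-test1 fixture, network access for the initial dandi.download.download call, and DANDI_API_KEY in the environment (provided in CI by the workflow change above). A minimal sketch of seeding throwaway fixture data, with hypothetical file names and sizes chosen so the 0.02 GB limit splits them across two archives:

    import os

    fixture_dir = "./000051/sourcedata/sub-test1"  # matches the test's input_dir
    os.makedirs(fixture_dir, exist_ok=True)
    for i in range(5):
        # Five 8 MB files against a ~20 MB limit -> two tar batches.
        with open(os.path.join(fixture_dir, f"frame{i:04d}.dat"), "wb") as f:
            f.write(os.urandom(8 * 1024 * 1024))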