Create transfer module to package spool files into tar and upload to DANDI #51
base: main
Changes from all commits
@@ -0,0 +1,120 @@
""" | ||
Convert a collection of spool .dat files generated by light sheet microscopy to compressed tar files and upload with the DANDI client. | ||
""" | ||
|
||
# stdlib | ||
import os | ||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
# externals | ||
import tarfile | ||
import cyclopts | ||
import dandi.download | ||
import dandi.upload | ||
|
||
# internals | ||
from linc_convert.modalities.lsm.cli import lsm | ||
|
||
transfer = cyclopts.App(name="transfer", help_format="markdown") | ||
lsm.command(transfer) | ||
|
||
@transfer.default | ||
def dandi_transfer(input_dir, dandiset_url, subject, output_dir='.', max_size_gb=2.00, upload=False): | ||
""" | ||
Upload .dat files to DANDI in batched, compressed tar archives. | ||
|
||
Parameters | ||
---------- | ||
input_dir : str | ||
Directory containing .dat files to upload | ||
[Review comment] An alternative could be for the input to be a list of files, rather than doing the glob ourselves in the function. Dat files can still be filtered using a command-line glob. But I understand that the directory-based interface might be easier to use for Emin et al.
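The command-line glob example the comment alludes to is not preserved in this capture. As a hedged sketch, the suggested interface might look like the following; the signature is hypothetical, and only `transfer` and the parameter names echo this PR:

# Hypothetical alternative signature: take the matched files directly and let
# the shell expand a glob such as /data/spool/*.dat on the command line.
from pathlib import Path

@transfer.default
def dandi_transfer(dat_files: list[Path], dandiset_url: str, subject: str,
                   output_dir: str = '.', max_size_gb: float = 2.0, upload: bool = False):
    dat_files = sorted(dat_files)  # no Path(input_dir).glob("*.dat") inside the function
    ...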
    dandiset_url : str
        URL of the dandiset to upload to (e.g., https://lincbrain.org/dandiset/000010)
    subject : str
        Subject label used to name the archive directory and tar files (sub-<subject>)
    output_dir : str, optional
        Directory in which to save the downloaded dandiset (default: '.')
[Review comment] We delete everything in the end, right? Should we just use a …
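The comment is cut off in this capture. If the suggestion is a temporary directory (an assumption on my part), a minimal sketch could be:

# Assumption: download into a throwaway directory that is removed automatically,
# instead of exposing output_dir as a parameter.
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    dandi.download.download(dandiset_url, output_dir=tmp_dir)
    # ... build archives under tmp_dir and upload them from there ...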
    max_size_gb : float, optional
        Maximum size of each archive in GB (default: 2)
    upload : bool, optional
        Upload the archives to DANDI (default: False)
    """
    max_size_bytes = int(max_size_gb * 1024 * 1024 * 1024)

    # Download the existing dandiset so new assets can be added alongside it
    dandi.download.download(dandiset_url, output_dir=output_dir)

    dandiset_id = dandiset_url.split('/')[-1]
    dandiset_directory = f'{output_dir}/{dandiset_id}'

    # Ensure a dataset_description.json exists at the dandiset root
    if not os.path.exists(f'{dandiset_directory}/dataset_description.json'):
        with open(f'{dandiset_directory}/dataset_description.json', 'w') as f:
            f.write('{}')

    archive_directory = f'{dandiset_directory}/sourcedata/sub-{subject}'
    os.makedirs(archive_directory, exist_ok=True)

    dat_files = list(Path(input_dir).glob("*.dat"))
    dat_files_size = len(dat_files)
    if dat_files_size:
        print(f"Found {dat_files_size} .dat files in '{input_dir}'.")
    else:
        print(f"No .dat files found in '{input_dir}'.")
        return

    batch = 0
    file_number = 0

    while file_number < dat_files_size:

        print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Creating archive batch {batch}")

        archive_path = os.path.join(archive_directory, f"sub-{subject}_desc-batch{batch}.tar")
        archive = tarfile.open(archive_path, "w")

        batch_size = 0
        batch_files = 0

        # Fill the archive until it reaches the size limit or the files run out.
        # The size is checked before adding, so a batch can exceed max_size_bytes
        # by up to one file.
        while batch_size < max_size_bytes and file_number < dat_files_size:

            file_path = dat_files[file_number]
            file_size = os.path.getsize(file_path)

            print(f"Adding '{file_path.name}' ({file_size/1024**2:.2f}MB, {file_number}) to archive.")
            archive.add(file_path, arcname=file_path.name)

            batch_size += file_size
            batch_files += 1
            file_number += 1

        archive.close()
        print(f"Archive created with {batch_files} files and {batch_size / 1024**2:.2f}MB size.")

        if upload:
            print(f"Uploading {archive_path}.")

            # Pick the DANDI instance that matches the dandiset URL
            if 'lincbrain.org' in dandiset_url:
                dandi_instance = 'linc'
            elif 'dandiarchive.org' in dandiset_url:
                dandi_instance = 'dandi'
            else:
                raise ValueError(f"Unknown DANDI instance: {dandiset_url}")

            # Retry until the upload succeeds (e.g., after a transient network error)
            success = False
            while not success:
                try:
                    dandi.upload.upload([dandiset_directory],
                                        dandi_instance=dandi_instance,
                                        )
                    success = True
                    print("Upload successful.")
                except Exception as e:
                    print(f"Upload failed with error: {e}")

            # Remove the local archive once it has been uploaded, to free disk space
            os.remove(archive_path)

        batch += 1

        print(f"Progress: {file_number}/{dat_files_size} files processed ({file_number/dat_files_size*100:.2f}%).")

    print(f"{file_number} files processed successfully.")
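For reference, a hypothetical end-to-end invocation of the new command, calling the function directly; the import path and argument values are illustrative, and only the signature comes from this PR:

# Hypothetical usage of the transfer command defined above;
# the module path is assumed, not confirmed by this PR.
from linc_convert.modalities.lsm.transfer import dandi_transfer

dandi_transfer(
    input_dir="/data/spool",                               # directory of .dat spool files
    dandiset_url="https://lincbrain.org/dandiset/000010",  # URL from the docstring example
    subject="mouse01",                                     # archives named sub-mouse01_desc-batch<N>.tar
    max_size_gb=2.0,                                       # ~2 GB per tar archive
    upload=True,                                           # push each batch to the 'linc' instance
)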