diff --git a/cohort_creator/_utils.py b/cohort_creator/_utils.py
index 8d83a11..cee2693 100644
--- a/cohort_creator/_utils.py
+++ b/cohort_creator/_utils.py
@@ -120,25 +120,6 @@ def filter_excluded_participants(pth: Path, participants: list[str] | None) -> N
     participants_df.to_csv(participants_tsv, sep="\t", index=False)
 
 
-def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
-    """Copy top files from BIDS src_pth to BIDS target_pth."""
-    top_files = ["dataset_description.json", "participants.*", "README*"]
-    if "func" in datatypes:
-        top_files.extend(["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"])
-    if "anat" in datatypes:
-        top_files.append("*T1w.json")
-
-    for top_file_ in top_files:
-        for f in src_pth.glob(top_file_):
-            if (target_pth / f.name).exists():
-                cc_log.debug(f" file already present:\n '{(target_pth / f.name)}'")
-                continue
-            try:
-                shutil.copy(src=f, dst=target_pth, follow_symlinks=True)
-            except FileNotFoundError:
-                cc_log.error(f" Could not find file '{f}'")
-
-
 def check_tsv_content(tsv_file: Path | str) -> pd.DataFrame:
     tsv_file = Path(tsv_file).resolve()
     if not tsv_file.exists():
diff --git a/cohort_creator/copy_files.py b/cohort_creator/copy_files.py
new file mode 100644
index 0000000..3ade7ea
--- /dev/null
+++ b/cohort_creator/copy_files.py
@@ -0,0 +1,84 @@
+"""Module to handle copying data out of source datalad datasets."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from datalad import api
+
+from cohort_creator._utils import (
+    get_filters,
+    list_all_files_with_filter,
+    no_files_found_msg,
+)
+from cohort_creator.logger import cc_logger
+
+cc_log = cc_logger()
+
+
+def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
+    """Copy top files from BIDS src_pth to BIDS target_pth."""
+    top_files = ["dataset_description.json", "participants.*", "README*"]
+    if "func" in datatypes:
+        top_files.extend(["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"])
+    if "anat" in datatypes:
+        top_files.append("*T1w.json")
+
+    for top_file_ in top_files:
+        for f in src_pth.glob(top_file_):
+            if (target_pth / f.name).exists():
+                cc_log.debug(f" file already present:\n '{(target_pth / f.name)}'")
+                continue
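+            # datalad's copy_file (rather than the shutil.copy call it replaces)
+            # is git-annex aware: annexed files are copied together with their
+            # availability metadata instead of as dangling annex symlinks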
+            try:
+                api.copy_file(path=f, target_dir=target_pth)
+            except FileNotFoundError:
+                cc_log.error(f" Could not find file '{f}'")
+
+
+def copy_this_subject(
+    subject: str,
+    datatypes: list[str],
+    dataset_type: str,
+    src_pth: Path,
+    target_pth: Path,
+    space: str = "MNI152NLin2009cAsym",
+    task: str = "*",
+    sessions: list[str] | list[None] | None = None,
+    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
+) -> None:
+    """Copy one subject's files for the requested datatypes from src_pth to target_pth."""
+    if sessions is None:
+        sessions = [None]
+    for datatype_ in datatypes:
+        filters = get_filters(
+            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
+        )
+        files = list_all_files_with_filter(
+            data_pth=src_pth,
+            dataset_type=dataset_type,
+            filters=filters,
+            subject=subject,
+            sessions=sessions,
+            datatype=datatype_,
+            task=task,
+            space=space,
+        )
+        if not files:
+            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
+            continue
+
+        cc_log.debug(f" {subject} - copying files:\n {files}")
+
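+        # paths in files are relative to the root of the datalad dataset;
+        # when src_pth points into a derivatives dataset, walk back up to
+        # that root so that dataset_root / f resolves correctly below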
+        dataset_root = src_pth
+        if "derivatives" in str(dataset_root):
+            dataset_root = Path(str(dataset_root).split("/derivatives")[0])
+
+        for f in files:
+            sub_dirs = Path(f).parents
+            (target_pth / sub_dirs[0]).mkdir(exist_ok=True, parents=True)
+            if (target_pth / f).exists():
+                cc_log.debug(f" file already present:\n '{f}'")
+                continue
+            try:
+                api.copy_file(path=dataset_root / f, target_dir=target_pth / sub_dirs[0])
+            except FileNotFoundError:
+                cc_log.error(f" Could not find file '{f}' in {dataset_root}")
diff --git a/cohort_creator/main.py b/cohort_creator/main.py
index 21a7f48..42fd070 100644
--- a/cohort_creator/main.py
+++ b/cohort_creator/main.py
@@ -8,7 +8,6 @@
 import itertools
 import json
-import shutil
 import subprocess
 from pathlib import Path
 
@@ -18,7 +17,6 @@
 from cohort_creator._utils import (
     add_study_tsv,
-    copy_top_files,
     create_ds_description,
     create_tsv_participant_session_in_datasets,
     dataset_path,
@@ -42,16 +40,13 @@
     sourcedata,
 )
 from cohort_creator.bagelify import bagelify, new_bagel
+from cohort_creator.copy_files import copy_this_subject, copy_top_files
 from cohort_creator.data.utils import is_known_dataset
 from cohort_creator.logger import cc_logger
 
 cc_log = cc_logger()
 
 
-def superdataset(pth: Path) -> api.Dataset:
-    return api.Dataset(pth)
-
-
 def install_datasets(
     datasets: list[str],
     output_dir: Path,
@@ -404,7 +399,7 @@
         else:
             sessions = list_sessions_in_participant(data_pth / subject)
 
-        _copy_this_subject(
+        copy_this_subject(
             subject=subject,
             sessions=sessions,
             datatypes=datatypes,
@@ -480,54 +475,6 @@ def _update_nipoppy_manifest(datatypes, subject, sessions, dataset_type_, output
     manifest.to_csv(manifest_path, index=False, na_rep="n/a")
 
 
-def _copy_this_subject(
-    subject: str,
-    sessions: list[str] | list[None],
-    datatypes: list[str],
-    dataset_type: str,
-    task: str,
-    space: str,
-    src_pth: Path,
-    target_pth: Path,
-    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
-) -> None:
-    for datatype_ in datatypes:
-        filters = get_filters(
-            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
-        )
-        files = list_all_files_with_filter(
-            data_pth=src_pth,
-            dataset_type=dataset_type,
-            filters=filters,
-            subject=subject,
-            sessions=sessions,
-            datatype=datatype_,
-            task=task,
-            space=space,
-        )
-        if not files:
-            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
-            continue
-
-        cc_log.debug(f" {subject} - copying files:\n {files}")
-
-        dataset_root = src_pth
-        if "derivatives" in str(dataset_root):
-            dataset_root = Path(str(dataset_root).split("/derivatives")[0])
-
-        for f in files:
-            sub_dirs = Path(f).parents
-            (target_pth / sub_dirs[0]).mkdir(exist_ok=True, parents=True)
-            if (target_pth / f).exists():
-                cc_log.debug(f" file already present:\n '{f}'")
-                continue
-            try:
-                shutil.copy(src=dataset_root / f, dst=target_pth / f, follow_symlinks=True)
-                # TODO deal with permission
-            except FileNotFoundError:
-                cc_log.error(f" Could not find file '{f}' in {dataset_root}")
-
-
 def _generate_bagel_for_cohort(
     output_dir: Path, dataset_names: list[str], dataset_types: list[str]
 ) -> None:
diff --git a/tests/conftest.py b/tests/conftest.py
index 92c897b..5502197 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,8 @@
 import pytest
 from datalad import api
 
+from cohort_creator._cli import create_yoda
+
 
 def root_dir():
     return Path(__file__).parent.parent
@@ -20,6 +22,12 @@ def bids_examples():
 
 
 @pytest.fixture
+def output_dir(tmp_path):
+    create_yoda(output_dir=tmp_path)
+    return tmp_path
+
+
+@pytest.fixture(scope="session")
 def install_dataset():
     def _install_dataset(dataset_name: str):
         output_path = Path(__file__).parent / "data" / "tmp" / dataset_name
diff --git a/tests/test_cohort_creator.py b/tests/test_cohort_creator.py
index 3cb7d6b..4c6209b 100644
--- a/tests/test_cohort_creator.py
+++ b/tests/test_cohort_creator.py
@@ -3,19 +3,11 @@
 from __future__ import annotations
 
 import pandas as pd
-import pytest
 
-from cohort_creator._cli import create_yoda
 from cohort_creator._utils import sourcedata
 from cohort_creator.main import construct_cohort, get_data, install_datasets
 
 
-@pytest.fixture
-def output_dir(tmp_path):
-    create_yoda(output_dir=tmp_path)
-    return tmp_path
-
-
 def test_install_datasets(output_dir, caplog):
     install_datasets(
         datasets=["ds000001", "foo"],
diff --git a/tests/test_copy_files.py b/tests/test_copy_files.py
new file mode 100644
index 0000000..34e004f
--- /dev/null
+++ b/tests/test_copy_files.py
@@ -0,0 +1,42 @@
+"""Tests for cohort_creator.copy_files."""
+
+from __future__ import annotations
+
+from cohort_creator._utils import sourcedata
+from cohort_creator.copy_files import copy_this_subject, copy_top_files
+from cohort_creator.main import install_datasets
+
+
+def test_copy_top_files(output_dir):
+    dataset_types = ["raw"]
+    install_datasets(
+        datasets=["ds000001", "foo"], output_dir=output_dir, dataset_types=dataset_types
+    )
+    copy_top_files(
+        src_pth=sourcedata(output_dir) / "ds000001",
+        target_pth=output_dir / "study-ds000001" / "bids",
+        datatypes=["anat", "func"],
+    )
+
+    assert (
+        output_dir / "study-ds000001" / "bids" / "task-balloonanalogrisktask_bold.json"
+    ).exists()
+    assert (output_dir / "study-ds000001" / "bids" / "README").exists()
+    assert (output_dir / "study-ds000001" / "bids" / "dataset_description.json").exists()
+
+
+def test_copy_this_subject(output_dir):
+    dataset_types = ["raw"]
+    datatypes = ["anat"]
+    install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
+    copy_this_subject(
+        subject="sub-01",
+        datatypes=datatypes,
+        dataset_type=dataset_types[0],
+        src_pth=sourcedata(output_dir) / "ds000001",
+        target_pth=output_dir / "study-ds000001" / "bids",
+    )
+
+    assert (
+        output_dir / "study-ds000001" / "bids" / "sub-01" / "anat" / "sub-01_T1w.nii.gz"
+    ).exists()
diff --git a/tmp.py b/tmp.py
new file mode 100644
index 0000000..680647f
--- /dev/null
+++ b/tmp.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+from cohort_creator._cli import create_yoda
+from cohort_creator._utils import sourcedata
+from cohort_creator.copy_files import copy_this_subject
+from cohort_creator.main import install_datasets
+
+output_dir = Path.cwd() / "tmp"
+create_yoda(output_dir)
+dataset_types = ["raw"]
+datatypes = ["anat"]
+install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
+copy_this_subject(
+    subject="sub-01",
+    datatypes=datatypes,
+    dataset_type=dataset_types[0],
+    src_pth=sourcedata(output_dir) / "ds000001",
+    target_pth=output_dir / "study-ds000001" / "bids",
+)
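+
+# quick sanity check: same path that tests/test_copy_files.py asserts on
+print((output_dir / "study-ds000001" / "bids" / "sub-01" / "anat" / "sub-01_T1w.nii.gz").exists())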