Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions cohort_creator/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,25 +120,6 @@ def filter_excluded_participants(pth: Path, participants: list[str] | None) -> N
participants_df.to_csv(participants_tsv, sep="\t", index=False)


def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
    """Copy top files from BIDS src_pth to BIDS target_pth.

    Always copies the dataset-level metadata files; task-level JSON/TSV
    files are added for ``func`` and the T1w sidecar for ``anat``.
    Files already present in the target are skipped.
    """
    patterns = ["dataset_description.json", "participants.*", "README*"]
    if "func" in datatypes:
        patterns += ["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"]
    if "anat" in datatypes:
        patterns += ["*T1w.json"]

    for pattern in patterns:
        for source_file in src_pth.glob(pattern):
            destination = target_pth / source_file.name
            if destination.exists():
                cc_log.debug(f"  file already present:\n '{destination}'")
                continue
            try:
                # follow_symlinks so annexed/linked content is materialized
                shutil.copy(src=source_file, dst=target_pth, follow_symlinks=True)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{source_file}'")


def check_tsv_content(tsv_file: Path | str) -> pd.DataFrame:
tsv_file = Path(tsv_file).resolve()
if not tsv_file.exists():
Expand Down
84 changes: 84 additions & 0 deletions cohort_creator/copy_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Module to handle copying data out of source datalad datasets."""

from __future__ import annotations

from pathlib import Path

from datalad import api

from cohort_creator._utils import (
get_filters,
list_all_files_with_filter,
no_files_found_msg,
)
from cohort_creator.logger import cc_logger

cc_log = cc_logger()


def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
    """Copy top files from BIDS src_pth to BIDS target_pth.

    Dataset-level metadata is always copied; task-level JSON/TSV files are
    added when ``func`` is requested and the T1w sidecar when ``anat`` is.
    Files already present in the target are skipped.
    """
    patterns = ["dataset_description.json", "participants.*", "README*"]
    if "func" in datatypes:
        patterns += ["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"]
    if "anat" in datatypes:
        patterns += ["*T1w.json"]

    for pattern in patterns:
        for source_file in src_pth.glob(pattern):
            destination = target_pth / source_file.name
            if destination.exists():
                cc_log.debug(f"  file already present:\n '{destination}'")
                continue
            try:
                # NOTE(review): assumes datalad's copy_file raises
                # FileNotFoundError for a missing source — confirm; datalad
                # commonly wraps failures in IncompleteResultsError instead.
                api.copy_file(path=source_file, target_dir=target_pth)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{source_file}'")


def copy_this_subject(
    subject: str,
    datatypes: list[str],
    dataset_type: str,
    src_pth: Path,
    target_pth: Path,
    space: str = "MNI152NLin2009cAsym",
    task: str = "*",
    sessions: list[str] | list[None] | None = None,
    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
) -> None:
    """Copy one subject's files for the requested datatypes into target_pth.

    For each datatype the matching files are listed with the BIDS filters,
    their sub-directories are created under ``target_pth``, and each file is
    copied out of the source datalad dataset. Existing target files are
    skipped; missing sources are logged as errors.
    """
    if sessions is None:
        sessions = [None]

    # datalad must copy from the root dataset; a derivatives path points
    # inside it, so strip everything from "/derivatives" onward.
    dataset_root = src_pth
    if "derivatives" in str(dataset_root):
        dataset_root = Path(str(dataset_root).split("/derivatives")[0])

    for datatype_ in datatypes:
        filters = get_filters(
            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
        )
        files = list_all_files_with_filter(
            data_pth=src_pth,
            dataset_type=dataset_type,
            filters=filters,
            subject=subject,
            sessions=sessions,
            datatype=datatype_,
            task=task,
            space=space,
        )
        if not files:
            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
            continue

        cc_log.debug(f"  {subject} - copying files:\n  {files}")

        for relative_file in files:
            # recreate the file's sub-directory tree under the target
            target_dir = target_pth / Path(relative_file).parent
            target_dir.mkdir(exist_ok=True, parents=True)
            if (target_pth / relative_file).exists():
                cc_log.debug(f"  file already present:\n '{relative_file}'")
                continue
            try:
                api.copy_file(path=dataset_root / relative_file, target_dir=target_dir)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{relative_file}' in {dataset_root}")
57 changes: 2 additions & 55 deletions cohort_creator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import itertools
import json
import shutil
import subprocess
from pathlib import Path

Expand All @@ -18,7 +17,6 @@

from cohort_creator._utils import (
add_study_tsv,
copy_top_files,
create_ds_description,
create_tsv_participant_session_in_datasets,
dataset_path,
Expand All @@ -42,16 +40,13 @@
sourcedata,
)
from cohort_creator.bagelify import bagelify, new_bagel
from cohort_creator.copy_files import copy_this_subject, copy_top_files
from cohort_creator.data.utils import is_known_dataset
from cohort_creator.logger import cc_logger

cc_log = cc_logger()


def superdataset(pth: Path) -> api.Dataset:
    """Return the datalad dataset rooted at ``pth``."""
    return api.Dataset(pth)


def install_datasets(
datasets: list[str],
output_dir: Path,
Expand Down Expand Up @@ -404,7 +399,7 @@ def construct_cohort(
else:
sessions = list_sessions_in_participant(data_pth / subject)

_copy_this_subject(
copy_this_subject(
subject=subject,
sessions=sessions,
datatypes=datatypes,
Expand Down Expand Up @@ -480,54 +475,6 @@ def _update_nipoppy_manifest(datatypes, subject, sessions, dataset_type_, output
manifest.to_csv(manifest_path, index=False, na_rep="n/a")


def _copy_this_subject(
    subject: str,
    sessions: list[str] | list[None],
    datatypes: list[str],
    dataset_type: str,
    task: str,
    space: str,
    src_pth: Path,
    target_pth: Path,
    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
) -> None:
    """Copy one subject's files for the requested datatypes into target_pth.

    For each datatype the matching files are listed with the BIDS filters,
    their sub-directories are created under ``target_pth``, and each file is
    copied from the source dataset. Existing target files are skipped;
    missing sources are logged as errors.
    """
    # For derivatives the files are indexed relative to the root dataset,
    # so strip everything from "/derivatives" onward.
    dataset_root = src_pth
    if "derivatives" in str(dataset_root):
        dataset_root = Path(str(dataset_root).split("/derivatives")[0])

    for datatype_ in datatypes:
        filters = get_filters(
            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
        )
        files = list_all_files_with_filter(
            data_pth=src_pth,
            dataset_type=dataset_type,
            filters=filters,
            subject=subject,
            sessions=sessions,
            datatype=datatype_,
            task=task,
            space=space,
        )
        if not files:
            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
            continue

        cc_log.debug(f"  {subject} - copying files:\n  {files}")

        for relative_file in files:
            # recreate the file's sub-directory tree under the target
            (target_pth / Path(relative_file).parent).mkdir(exist_ok=True, parents=True)
            if (target_pth / relative_file).exists():
                cc_log.debug(f"  file already present:\n '{relative_file}'")
                continue
            try:
                shutil.copy(
                    src=dataset_root / relative_file,
                    dst=target_pth / relative_file,
                    follow_symlinks=True,
                )
            # TODO deal with permission
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{relative_file}' in {dataset_root}")


def _generate_bagel_for_cohort(
output_dir: Path, dataset_names: list[str], dataset_types: list[str]
) -> None:
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pytest
from datalad import api

from cohort_creator._cli import create_yoda


def root_dir():
    """Return the repository root (two levels above this test module)."""
    this_file = Path(__file__)
    return this_file.parent.parent
Expand All @@ -20,6 +22,12 @@ def bids_examples():


@pytest.fixture
def output_dir(tmp_path):
    # Prepare a fresh per-test output directory: create_yoda() initializes
    # the expected cohort-creator layout inside pytest's tmp_path.
    create_yoda(output_dir=tmp_path)
    return tmp_path


@pytest.fixture(scope="session")
def install_dataset():
def _install_dataset(dataset_name: str):
output_path = Path(__file__).parent / "data" / "tmp" / dataset_name
Expand Down
8 changes: 0 additions & 8 deletions tests/test_cohort_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,11 @@
from __future__ import annotations

import pandas as pd
import pytest

from cohort_creator._cli import create_yoda
from cohort_creator._utils import sourcedata
from cohort_creator.main import construct_cohort, get_data, install_datasets


@pytest.fixture
def output_dir(tmp_path):
create_yoda(output_dir=tmp_path)
return tmp_path


def test_install_datasets(output_dir, caplog):
install_datasets(
datasets=["ds000001", "foo"],
Expand Down
42 changes: 42 additions & 0 deletions tests/test_copy_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Utilities."""

from __future__ import annotations

from cohort_creator._utils import sourcedata
from cohort_creator.copy_files import copy_this_subject, copy_top_files
from cohort_creator.main import install_datasets


def test_copy_top_files(output_dir):
    """Top-level BIDS files are copied for the requested datatypes."""
    dataset_types = ["raw"]
    install_datasets(
        datasets=["ds000001", "foo"], output_dir=output_dir, dataset_types=dataset_types
    )

    bids_dir = output_dir / "study-ds000001" / "bids"
    copy_top_files(
        src_pth=sourcedata(output_dir) / "ds000001",
        target_pth=bids_dir,
        datatypes=["anat", "func"],
    )

    for expected in (
        "task-balloonanalogrisktask_bold.json",
        "README",
        "dataset_description.json",
    ):
        assert (bids_dir / expected).exists()


def test_copy_this_subject(output_dir):
    """copy_this_subject copies the requested subject's anat files — and only those.

    Also checks that data of a subject that was NOT requested is absent,
    addressing the review comment asking for a negative case (the suggested
    assertion referenced a mismatched filename, ``sub-03_T1w.nii.gzz``;
    a consistent ``sub-02`` check is used instead).
    """
    dataset_types = ["raw"]
    datatypes = ["anat"]
    install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
    copy_this_subject(
        subject="sub-01",
        datatypes=datatypes,
        dataset_type=dataset_types[0],
        src_pth=sourcedata(output_dir) / "ds000001",
        target_pth=output_dir / "study-ds000001" / "bids",
    )

    bids_dir = output_dir / "study-ds000001" / "bids"
    assert (bids_dir / "sub-01" / "anat" / "sub-01_T1w.nii.gz").exists()
    # only the requested subject must be copied
    assert not (bids_dir / "sub-02").exists()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Test test_copy_files lacks validation for non-existent files.

It would be beneficial to include a test case that validates the behavior when the expected output files do not exist after the copy_files operation. This could help in ensuring the function's robustness in handling errors or unexpected conditions.

Suggested change
).exists()
assert not (
output_dir / "study-ds000001" / "bids" / "sub-01" / "anat" / "sub-03_T1w.nii.gzz"
).exists()

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (testing): The assertion in test_copy_files may reference an incorrect file path.

The file path in the assertion (sub-03_T1w.nii.gzz) does not match the participant ID (sub-01) used in the test setup. This discrepancy could lead to false positives or negatives in test outcomes. Please verify the intended file path and participant ID.

19 changes: 19 additions & 0 deletions tmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# NOTE(review): this looks like a scratch/debug script ("tmp.py") that was
# committed with the PR — confirm whether it should be removed before merge.
# It manually reproduces what tests/test_copy_files.py::test_copy_this_subject
# does: set up a YODA layout, install ds000001, and copy one subject's anat.
from pathlib import Path

from cohort_creator._cli import create_yoda
from cohort_creator._utils import sourcedata
from cohort_creator.copy_files import copy_this_subject
from cohort_creator.main import install_datasets

# Work in ./tmp and initialize the expected cohort-creator output layout.
output_dir = Path.cwd() / "tmp"
create_yoda(output_dir)
dataset_types = ["raw"]
datatypes = ["anat"]
install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
copy_this_subject(
    subject="sub-01",
    datatypes=datatypes,
    dataset_type=dataset_types[0],
    src_pth=sourcedata(output_dir) / "ds000001",
    target_pth=output_dir / "study-ds000001" / "bids",
)