Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions cohort_creator/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,25 +120,6 @@ def filter_excluded_participants(pth: Path, participants: list[str] | None) -> N
participants_df.to_csv(participants_tsv, sep="\t", index=False)


def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
    """Copy top files from BIDS src_pth to BIDS target_pth.

    Always copies the dataset-level metadata files; task-level JSON/TSV
    files are added for ``func`` and the T1w sidecar for ``anat``.
    Files already present in the target are skipped.
    """
    patterns = ["dataset_description.json", "participants.*", "README*"]
    if "func" in datatypes:
        patterns += ["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"]
    if "anat" in datatypes:
        patterns += ["*T1w.json"]

    for pattern in patterns:
        for source_file in src_pth.glob(pattern):
            destination = target_pth / source_file.name
            if destination.exists():
                cc_log.debug(f"  file already present:\n '{destination}'")
                continue
            try:
                # follow_symlinks so annexed/linked content is materialized
                shutil.copy(src=source_file, dst=target_pth, follow_symlinks=True)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{source_file}'")


def check_tsv_content(tsv_file: Path | str) -> pd.DataFrame:
tsv_file = Path(tsv_file).resolve()
if not tsv_file.exists():
Expand Down
84 changes: 84 additions & 0 deletions cohort_creator/copy_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Module to handle copying data out of source datalad datasets."""

from __future__ import annotations

from pathlib import Path

from datalad import api

from cohort_creator._utils import (
get_filters,
list_all_files_with_filter,
no_files_found_msg,
)
from cohort_creator.logger import cc_logger

cc_log = cc_logger()


def copy_top_files(src_pth: Path, target_pth: Path, datatypes: list[str]) -> None:
    """Copy top files from BIDS src_pth to BIDS target_pth.

    Dataset-level metadata is always copied; task-level JSON/TSV files are
    added when ``func`` is requested and the T1w sidecar when ``anat`` is.
    Files already present in the target are skipped.
    """
    patterns = ["dataset_description.json", "participants.*", "README*"]
    if "func" in datatypes:
        patterns += ["*task-*_events.tsv", "*task-*_events.json", "*task-*_bold.json"]
    if "anat" in datatypes:
        patterns += ["*T1w.json"]

    for pattern in patterns:
        for source_file in src_pth.glob(pattern):
            destination = target_pth / source_file.name
            if destination.exists():
                cc_log.debug(f"  file already present:\n '{destination}'")
                continue
            try:
                # NOTE(review): assumes datalad's copy_file raises
                # FileNotFoundError for a missing source — confirm; datalad
                # commonly wraps failures in IncompleteResultsError instead.
                api.copy_file(path=source_file, target_dir=target_pth)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{source_file}'")


def copy_this_subject(
    subject: str,
    datatypes: list[str],
    dataset_type: str,
    src_pth: Path,
    target_pth: Path,
    space: str = "MNI152NLin2009cAsym",
    task: str = "*",
    sessions: list[str] | list[None] | None = None,
    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
) -> None:
    """Copy one subject's files for the requested datatypes into target_pth.

    For each datatype the matching files are listed with the BIDS filters,
    their sub-directories are created under ``target_pth``, and each file is
    copied out of the source datalad dataset. Existing target files are
    skipped; missing sources are logged as errors.
    """
    if sessions is None:
        sessions = [None]

    # datalad must copy from the root dataset; a derivatives path points
    # inside it, so strip everything from "/derivatives" onward.
    dataset_root = src_pth
    if "derivatives" in str(dataset_root):
        dataset_root = Path(str(dataset_root).split("/derivatives")[0])

    for datatype_ in datatypes:
        filters = get_filters(
            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
        )
        files = list_all_files_with_filter(
            data_pth=src_pth,
            dataset_type=dataset_type,
            filters=filters,
            subject=subject,
            sessions=sessions,
            datatype=datatype_,
            task=task,
            space=space,
        )
        if not files:
            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
            continue

        cc_log.debug(f"  {subject} - copying files:\n  {files}")

        for relative_file in files:
            # recreate the file's sub-directory tree under the target
            target_dir = target_pth / Path(relative_file).parent
            target_dir.mkdir(exist_ok=True, parents=True)
            if (target_pth / relative_file).exists():
                cc_log.debug(f"  file already present:\n '{relative_file}'")
                continue
            try:
                api.copy_file(path=dataset_root / relative_file, target_dir=target_dir)
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{relative_file}' in {dataset_root}")
57 changes: 2 additions & 55 deletions cohort_creator/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import itertools
import json
import shutil
import subprocess
from pathlib import Path

Expand All @@ -18,7 +17,6 @@

from cohort_creator._utils import (
add_study_tsv,
copy_top_files,
create_ds_description,
create_tsv_participant_session_in_datasets,
dataset_path,
Expand All @@ -42,16 +40,13 @@
sourcedata,
)
from cohort_creator.bagelify import bagelify, new_bagel
from cohort_creator.copy_files import copy_this_subject, copy_top_files
from cohort_creator.data.utils import is_known_dataset
from cohort_creator.logger import cc_logger

cc_log = cc_logger()


def superdataset(pth: Path) -> api.Dataset:
    """Return the datalad dataset rooted at ``pth``."""
    return api.Dataset(pth)


def install_datasets(
datasets: list[str],
output_dir: Path,
Expand Down Expand Up @@ -404,7 +399,7 @@ def construct_cohort(
else:
sessions = list_sessions_in_participant(data_pth / subject)

_copy_this_subject(
copy_this_subject(
subject=subject,
sessions=sessions,
datatypes=datatypes,
Expand Down Expand Up @@ -480,54 +475,6 @@ def _update_nipoppy_manifest(datatypes, subject, sessions, dataset_type_, output
manifest.to_csv(manifest_path, index=False, na_rep="n/a")


def _copy_this_subject(
    subject: str,
    sessions: list[str] | list[None],
    datatypes: list[str],
    dataset_type: str,
    task: str,
    space: str,
    src_pth: Path,
    target_pth: Path,
    bids_filter: None | dict[str, dict[str, dict[str, str]]] = None,
) -> None:
    """Copy one subject's files for the requested datatypes into target_pth.

    For each datatype the matching files are listed with the BIDS filters,
    their sub-directories are created under ``target_pth``, and each file is
    copied from the source dataset. Existing target files are skipped;
    missing sources are logged as errors.
    """
    # For derivatives the files are indexed relative to the root dataset,
    # so strip everything from "/derivatives" onward.
    dataset_root = src_pth
    if "derivatives" in str(dataset_root):
        dataset_root = Path(str(dataset_root).split("/derivatives")[0])

    for datatype_ in datatypes:
        filters = get_filters(
            dataset_type=dataset_type, datatype=datatype_, bids_filter=bids_filter
        )
        files = list_all_files_with_filter(
            data_pth=src_pth,
            dataset_type=dataset_type,
            filters=filters,
            subject=subject,
            sessions=sessions,
            datatype=datatype_,
            task=task,
            space=space,
        )
        if not files:
            cc_log.warning(no_files_found_msg(src_pth, subject, datatype_, filters))
            continue

        cc_log.debug(f"  {subject} - copying files:\n  {files}")

        for relative_file in files:
            # recreate the file's sub-directory tree under the target
            (target_pth / Path(relative_file).parent).mkdir(exist_ok=True, parents=True)
            if (target_pth / relative_file).exists():
                cc_log.debug(f"  file already present:\n '{relative_file}'")
                continue
            try:
                shutil.copy(
                    src=dataset_root / relative_file,
                    dst=target_pth / relative_file,
                    follow_symlinks=True,
                )
            # TODO deal with permission
            except FileNotFoundError:
                cc_log.error(f"  Could not find file '{relative_file}' in {dataset_root}")


def _generate_bagel_for_cohort(
output_dir: Path, dataset_names: list[str], dataset_types: list[str]
) -> None:
Expand Down
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pytest
from datalad import api

from cohort_creator._cli import create_yoda


def root_dir():
    """Return the repository root (two levels above this test module)."""
    this_file = Path(__file__)
    return this_file.parent.parent
Expand All @@ -20,6 +22,12 @@ def bids_examples():


@pytest.fixture
def output_dir(tmp_path):
    # Prepare a fresh per-test output directory: create_yoda() initializes
    # the expected cohort-creator layout inside pytest's tmp_path.
    create_yoda(output_dir=tmp_path)
    return tmp_path


@pytest.fixture(scope="session")
def install_dataset():
def _install_dataset(dataset_name: str):
output_path = Path(__file__).parent / "data" / "tmp" / dataset_name
Expand Down
8 changes: 0 additions & 8 deletions tests/test_cohort_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,11 @@
from __future__ import annotations

import pandas as pd
import pytest

from cohort_creator._cli import create_yoda
from cohort_creator._utils import sourcedata
from cohort_creator.main import construct_cohort, get_data, install_datasets


@pytest.fixture
def output_dir(tmp_path):
create_yoda(output_dir=tmp_path)
return tmp_path


def test_install_datasets(output_dir, caplog):
install_datasets(
datasets=["ds000001", "foo"],
Expand Down
42 changes: 42 additions & 0 deletions tests/test_copy_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Utilities."""

from __future__ import annotations

from cohort_creator._utils import sourcedata
from cohort_creator.copy_files import copy_this_subject, copy_top_files
from cohort_creator.main import install_datasets


def test_copy_top_files(output_dir):
    """Top-level BIDS files are copied for the requested datatypes."""
    dataset_types = ["raw"]
    install_datasets(
        datasets=["ds000001", "foo"], output_dir=output_dir, dataset_types=dataset_types
    )

    bids_dir = output_dir / "study-ds000001" / "bids"
    copy_top_files(
        src_pth=sourcedata(output_dir) / "ds000001",
        target_pth=bids_dir,
        datatypes=["anat", "func"],
    )

    for expected in (
        "task-balloonanalogrisktask_bold.json",
        "README",
        "dataset_description.json",
    ):
        assert (bids_dir / expected).exists()


def test_copy_this_subject(output_dir):
    """copy_this_subject copies the requested subject's anat files — and only those.

    Also checks that data of a subject that was NOT requested is absent,
    addressing the review comment asking for a negative case (the suggested
    assertion referenced a mismatched filename, ``sub-03_T1w.nii.gzz``;
    a consistent ``sub-02`` check is used instead).
    """
    dataset_types = ["raw"]
    datatypes = ["anat"]
    install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
    copy_this_subject(
        subject="sub-01",
        datatypes=datatypes,
        dataset_type=dataset_types[0],
        src_pth=sourcedata(output_dir) / "ds000001",
        target_pth=output_dir / "study-ds000001" / "bids",
    )

    bids_dir = output_dir / "study-ds000001" / "bids"
    assert (bids_dir / "sub-01" / "anat" / "sub-01_T1w.nii.gz").exists()
    # only the requested subject must be copied
    assert not (bids_dir / "sub-02").exists()
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Test test_copy_files lacks validation for non-existent files.

It would be beneficial to include a test case that validates the behavior when the expected output files do not exist after the copy_files operation. This could help in ensuring the function's robustness in handling errors or unexpected conditions.

Suggested change
).exists()
assert not (
output_dir / "study-ds000001" / "bids" / "sub-01" / "anat" / "sub-03_T1w.nii.gzz"
).exists()

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (testing): The assertion in test_copy_files may reference an incorrect file path.

The file path in the assertion (sub-03_T1w.nii.gzz) does not match the participant ID (sub-01) used in the test setup. This discrepancy could lead to false positives or negatives in test outcomes. Please verify the intended file path and participant ID.

19 changes: 19 additions & 0 deletions tmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# NOTE(review): this looks like a scratch/debug script ("tmp.py") that was
# committed with the PR — confirm whether it should be removed before merge.
# It manually reproduces what tests/test_copy_files.py::test_copy_this_subject
# does: set up a YODA layout, install ds000001, and copy one subject's anat.
from pathlib import Path

from cohort_creator._cli import create_yoda
from cohort_creator._utils import sourcedata
from cohort_creator.copy_files import copy_this_subject
from cohort_creator.main import install_datasets

# Work in ./tmp and initialize the expected cohort-creator output layout.
output_dir = Path.cwd() / "tmp"
create_yoda(output_dir)
dataset_types = ["raw"]
datatypes = ["anat"]
install_datasets(datasets=["ds000001"], output_dir=output_dir, dataset_types=dataset_types)
copy_this_subject(
    subject="sub-01",
    datatypes=datatypes,
    dataset_type=dataset_types[0],
    src_pth=sourcedata(output_dir) / "ds000001",
    target_pth=output_dir / "study-ds000001" / "bids",
)