Fix run_id for BIDS dataset

sinhaharsh · Feb 9, 2024 · eae4be7 · eae4be7
1 parent 435fc76
commit eae4be7
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 8 deletions.
diff --git a/MRdataset/bids.py b/MRdataset/bids.py
@@ -1,13 +1,13 @@
 from abc import ABC
 from pathlib import Path
-
-from protocol import BidsImagingSequence
+from re import search
 
 from MRdataset import logger
 from MRdataset.base import BaseDataset
 from MRdataset.config import VALID_BIDS_DATATYPES
 from MRdataset.dicom_utils import is_bids_file
 from MRdataset.utils import folders_with_min_files, valid_dirs, read_json
+from protocol import BidsImagingSequence
 
 
 class BidsDataset(BaseDataset, ABC):
@@ -100,6 +100,7 @@ def _process(self, folder):
         """Processes the folder and returns a list of sequences."""
         json_files = self._filter_json_files(folder)
         sequences = []
+        last_id = 0
         for i, file in enumerate(json_files):
             try:
                 seq = BidsImagingSequence(bidsfile=file, path=folder)
@@ -121,11 +122,30 @@ def _process(self, folder):
 
             # None of the datasets we processed (over 20) had run information,
             # even though BIDS allows it. So we just use run-0x for all of them.
-            run_id = f'run-{str(i + 1).zfill(2)}'
+            run_id, last_id = self.get_run_id(file, last_id)
             seq.set_session_info(subject_id=subject_id,
                                  session_id=session_id,
                                  run_id=run_id,
                                  name=name)
             if seq.is_valid():
                 sequences.append(seq)
         return sequences
+
+    @staticmethod
+    def get_run_id(filename, last_id):
+        """
+        Return (run_id, run_number) for a BIDS file, e.g. ('run-01', 1).
+        Example filename : sub-01_ses-imagery01_task-imagery_run-01_bold.json
+        Without a run entity, fall back to last_id + 1, zero-padded to 2.
+        """
+        # Match only a whole `run-<index>` entity in the basename, so that a
+        # `run-<n>` in a parent folder name or inside another label is ignored.
+        match = search(r'(?:^|_)(run-(\d+))', Path(filename).name)
+
+        if match:
+            run_id = match.group(1)
+            new_id_num = int(match.group(2))
+        else:
+            new_id_num = last_id + 1
+            run_id = f'run-{str(new_id_num).zfill(2)}'
+        return run_id, new_id_num
diff --git a/MRdataset/dicom_utils.py b/MRdataset/dicom_utils.py
@@ -1,11 +1,11 @@
 """ Utility functions for dicom files """
 import warnings
 from pathlib import Path
+from re import search
 from typing import Union
 
 import dicom2nifti
 import pydicom
-
 from MRdataset import logger
 
 with warnings.catch_warnings():
@@ -29,6 +29,14 @@ def is_bids_file(filename: Union[str, Path]):
     # TODO: Add some criteria to skip certain files
     if 'derivatives' in str(filename):
         return False
+
+    # Regular expression pattern
+    pattern = r'sub-\d+'
+    # Extracting substring using regex
+    match = search(pattern, str(filename))
+    if not match:
+        return False
+
     return True
 
 

diff --git a/MRdataset/tests/simulate.py b/MRdataset/tests/simulate.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 
 import pydicom
-
+from MRdataset.dicom_utils import is_bids_file
 from MRdataset.tests.config import compl_data_xnat
 from MRdataset.utils import convert2ascii
 
@@ -100,16 +100,16 @@ def make_compliant_bids_dataset(num_subjects,
                                 echo_train_length,
                                 flip_angle) -> Path:
     src_dir, dest_dir = setup_directories(sample_bids_dataset())
-    json_list = list(src_dir.glob('**/*.json'))
+    json_list = filter(is_bids_file, src_dir.glob('**/*.json'))
     subject_names = set()
     i = -1
 
     while len(subject_names) < num_subjects:
         i += 1
 
         try:
-            filepath = json_list[i]
-        except IndexError:
+            filepath = next(json_list)
+        except StopIteration:
             break
 
         try: