Skip to content

Commit

Permalink
Merge pull request #3 from phcerdan/add_3D
Browse files Browse the repository at this point in the history
Add generating 3D volumes and masks from segmentations
  • Loading branch information
phcerdan authored Dec 14, 2023
2 parents 0a33599 + 9e6067d commit 1d2131b
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 8 deletions.
54 changes: 47 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Utility functions for MD.ai. Download and upload 2D and 3D segmentation images.

## Download data

- Download all data, dicoms and annotations.
- Download all data: dicoms and annotations.

```bash
python -m mdai_utils.download_annotations \
Expand All @@ -13,14 +13,54 @@ python -m mdai_utils.download_annotations \
--download_dicoms
```

Once dicoms are downloaded locally, just download annotations. The annotations
coming from MD.ai are json files. They will be stored in `./data`:
Dicoms will be downloaded with the default md.ai structure:

```md
- data/mdai_uab_project_L1NprBvP_images_dataset_D_odXMLm_2023-12-13-152228/
- 1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421/
- 1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453/
1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414630768.dcm
1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414645741.dcm
1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414662833.dcm
1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414677861.dcm
1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414694890.dcm
```

```bash
python -m mdai_utils.download_annotations \
--parameters myparameters.json \
-o ./data
If the option `--create_volumes` is added (to cli or the parameters file), a 3D
image will be generated in parallel to the `.dcm` files:

```md
image.nii.gz
volume_metadata.json
```

The annotations/segmentations from md.ai are stored in json file.

```md
./data/mdai_uab_project_L1NprBvP_annotations_dataset_D_odXMLm_labelgroup_G_2Jy2yZ_2023-12-13-152213.json
```
and the masks/images are stored in a folder:

```md
./data/mdai_uab_project_L1NprBvP_annotations_dataset_D_odXMLm_labelgroup_G_2Jy2yZ_2023-12-13-152213_segmentations_2023-12-14-114011/
```

with structure:

```md
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414630768.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414645741.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414662833.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414677861.nii.gz
mylabel__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286421__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610384286453__1.2.826.0.1.3680043.2.1125.1.75064541463040.2005072610414694890.nii.gz
pair_data.json
volumes/ # Only generated with --create_volumes option
```

---

Once dicoms are downloaded locally, and they are not changed in md.ai, do not
pass the --download_dicoms option to avoid re-downloading them.

## Upload 2D segmentations

Expand Down
206 changes: 206 additions & 0 deletions mdai_utils/download_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,190 @@ def match_folder_to_json_file(
return Path(input_path) / folder_match[0] if folder_match else None


def get_dicom_names_ordered_and_metadata(dicom_dir):
"""
Read the dicom files in a folder.
Return a list of tuples (filename, sop_instance_uid) ordered by z position.
"""
io = itk.GDCMSeriesFileNames.New() # pyright: ignore[reportGeneralTypeIssues]
io.AddSeriesRestriction("0008|0021")
io.SetUseSeriesDetails(True)
io.SetDirectory(str(dicom_dir))
io.Update()

series_uids = io.GetSeriesUIDs()

if len(series_uids) == 0:
raise ValueError(f"Found no series in folder: {dicom_dir}")
elif len(series_uids) > 1:
raise ValueError(f"Found more than one series in the same folder: {dicom_dir}")
series_id = series_uids[0]
# Ordered by z position
dicom_names_ordered = io.GetFileNames(series_id)
# Get also SOPInstanceUID for each file

# Create an instance of GDCMImageIO
gdcm_io = itk.GDCMImageIO.New() # pyright: ignore[reportGeneralTypeIssues]
gdcm_io.LoadPrivateTagsOn()

# Get SOPInstanceUID for each file
dicom_names_with_uid_ordered = []
metadict_volume = {}
for i, dicom_name in enumerate(dicom_names_ordered):
gdcm_io.SetFileName(dicom_name)
gdcm_io.ReadImageInformation()
if i == 0:
reader = (
itk.ImageFileReader.New( # pyright: ignore[reportGeneralTypeIssues]
FileName=dicom_names_ordered[0], ImageIO=gdcm_io
)
)
reader.Update()
metadict = reader.GetMetaDataDictionary()
tagkeys = metadict.GetKeys()
for tagkey in tagkeys:
if (
tagkey == "ITK_original_direction"
or tagkey == "ITK_original_spacing"
):
continue
tagvalue = metadict[tagkey]
metadict_volume[tagkey] = tagvalue

_, sop_instance_uid = gdcm_io.GetValueFromTag(
"0008|0018", ""
) # Tag for SOPInstanceUID
dicom_names_with_uid_ordered.append((dicom_name, sop_instance_uid))

# Add to the metadict_volume a new key: "slices", a dict with key SOPInstanceUID
# And values: a new dict with index, and filename
slices_dict = {}
for i, (dicom_name, sop_instance_uid) in enumerate(dicom_names_with_uid_ordered):
slices_dict[sop_instance_uid] = {"index": i, "filename": str(dicom_name)}
metadict_volume["slices"] = slices_dict

return dicom_names_with_uid_ordered, metadict_volume


def merge_slices_into3D(
pair_data_json_file, labels, output_path_mask_3d, output_path_grayscale_3d=None
):
"""
PRECONDITION: pair_data_json_file contains a list of slices of paired data:
image (dicom) and label (mask in nifti format)
We want to read all dicoms as a 3D volume, and merge all nifti masks into a single 3D mask,
with the same shape as the dicom volume.
"""

with open(pair_data_json_file) as f:
pair_data = json.load(f)

if len(pair_data) == 0:
raise ValueError(f"pair_data_json_file is empty: {pair_data_json_file}")

# We want to group all the slices sharing same StudyInstanceUID and SeriesInstanceUID
pair_data_grouped = {}
for slice in pair_data:
hash_id = slice["study__series__sop_ids"]
study_id, series_id, sop_id = hash_id.split("__")
if study_id not in pair_data_grouped:
pair_data_grouped[study_id] = {}
if series_id not in pair_data_grouped[study_id]:
pair_data_grouped[study_id][series_id] = []
slice_with_sop_id = slice.copy()
slice_with_sop_id["sop_id"] = sop_id
pair_data_grouped[study_id][series_id].append(slice_with_sop_id)

# About the ordering...
# We are going to read the original dicom files with GDCM via itk
for study_id, series_dict in pair_data_grouped.items():
for series_id, slices in series_dict.items():
# Read the dicom files
dicom_dir = Path(slices[0]["image"]).parent
(
dicom_names_with_uid_ordered,
metadict_volume,
) = get_dicom_names_ordered_and_metadata(dicom_dir)

# Read the dicom files with itk SeriesReader
filenames_ordered = [name for name, _ in dicom_names_with_uid_ordered]
reader = (
itk.ImageSeriesReader.New( # pyright: ignore[reportGeneralTypeIssues]
FileNames=filenames_ordered, ForceOrthogonalDirection=False
)
)
reader.Update()
dicom_volume = reader.GetOutput()
if output_path_grayscale_3d is not None:
output_grayscale_parent_folder = (
Path(output_path_grayscale_3d) / study_id / series_id
)
output_grayscale_parent_folder.mkdir(parents=True, exist_ok=True)
output_dicom_volume_path = (
output_grayscale_parent_folder / "image.nii.gz"
)
itk.imwrite(dicom_volume, output_dicom_volume_path)
# Save the metadata
metadict_volume_output_path = (
output_grayscale_parent_folder / "volume_metadata.json"
)
with open(metadict_volume_output_path, "w") as f:
json.dump(metadict_volume, f, indent=2)

# Ok, now we have the dicom volume and the mapping for ordered SOPInstanceUID
# We now need to create a 3D mask with the same shape as the dicom volume
# And populate it with the masks from the slices, after ordering them using the SOPInstanceUID order from the dicom_named_ordered

ordered_sop_ids = [sop_id for _, sop_id in dicom_names_with_uid_ordered]

# Create the 3D mask
output_mask_parent_folder = Path(output_path_mask_3d) / study_id / series_id
output_mask_parent_folder.mkdir(parents=True, exist_ok=True)
for label in labels:
label_volume_np = np.zeros(dicom_volume.shape, dtype=np.uint8)
for slice in slices:
# Read the mask
mask_path = slice.get(label)
if mask_path is None:
raise ValueError(
f"Could not find label {label} in slice {slice}"
)
label_image = itk.imread(mask_path)
# Assert image is 3D.
if label_image.GetImageDimension() != 3:
raise ValueError(
f"Expected that the mask is a 3D image, with Z=1 slice, found {label_image.GetImageDimension()}"
)
# Get the SOPInstanceUID
hash_id = slice["study__series__sop_ids"]
_, _, sop_id = hash_id.split("__")
# Get the index of the SOPInstanceUID in the ordered list
sop_index = ordered_sop_ids.index(sop_id)
label_image_np = itk.array_from_image(
label_image
) # The label is Z=1, Y, X
# Populate the 3D mask slice
label_volume_np[sop_index] = label_image_np.squeeze()

# Convert to itk image
label_volume = itk.image_from_array(label_volume_np)
label_volume.SetOrigin(dicom_volume.GetOrigin())
label_volume.SetSpacing(dicom_volume.GetSpacing())
label_volume.SetDirection(dicom_volume.GetDirection())

# Check label and image have the same shape
label_volume_size = np.array(label_volume.shape)
image_volume_size = np.array(dicom_volume.shape)
if not np.all(label_volume_size == image_volume_size):
raise ValueError(
f"Label and image have different shapes: {label_volume_size} != {image_volume_size}"
)

# Write the label volume
output_mask_path = output_mask_parent_folder / f"{label}.nii.gz"
itk.imwrite(label_volume, output_mask_path)


def main(args):
print(args)

Expand Down Expand Up @@ -172,6 +356,7 @@ def main(args):
"mdai_no_fixing_metadata", False
)
labels = args.labels or parameters.get("labels", [])
create_volumes = args.create_volumes or parameters.get("create_volumes", False)

mdai_label_ids = bidict(parameters.get("mdai_label_ids", {}))
if labels:
Expand Down Expand Up @@ -246,6 +431,7 @@ def main(args):
sep = "__"
hash_id = sep.join(hash_)
pair_data_entry = {}
pair_data_entry["study__series__sop_ids"] = hash_id

raw_slice_image = None
if not no_fixing_metadata:
Expand Down Expand Up @@ -307,6 +493,20 @@ def main(args):

print(f"pair_data_folder: {labels_parent_folder}")

if create_volumes:
pair_data_json_file = Path(labels_parent_folder) / "pair_data.json"
out_path_masks_3d = Path(labels_parent_folder) / "volumes"
out_path_masks_3d.mkdir(parents=True, exist_ok=True)
# Create the volumes in the original data downloaded by md.ai
# The structure is {study_id}/{series_id}/image.dcm.
# We add the 3d nifti volume and the metadata in that folder.
out_path_grayscale_3d = None
if not no_fixing_metadata and download_dicoms:
out_path_grayscale_3d = Path(match_folder) if match_folder else None
merge_slices_into3D(
pair_data_json_file, labels, out_path_masks_3d, out_path_grayscale_3d
)


def get_download_parser():
import argparse
Expand Down Expand Up @@ -371,6 +571,12 @@ def get_download_parser():
help="Labels to process. If none is set, all labels found will be processed.",
)

parser.add_argument(
"--create_volumes",
action="store_true",
help="The dataset is 3D, and we want to merge all slices and dicoms into 3D volumes.",
)

parser.add_argument(
"--parameters",
type=str,
Expand Down
3 changes: 2 additions & 1 deletion tests/test_parameters.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
"mdai_no_fixing_metadata": false,
"labels": [
"mylabel"
]
],
"create_volumes": true
}

0 comments on commit 1d2131b

Please sign in to comment.