BAMF NNUnet Lung and Nodules (v2) #92
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
jithenece wants to merge 18 commits into MHubAI:main from bamf-health:bamf_nnunet_ct_lung_v2
Commits (18)
- 5ce512a  version 2 model for lung/nodules (jithenece)
- 1d77d3a  add stats (jithenece)
- c9a1cfe  change folder name (jithenece)
- 6c1f421  add analyses to meta (jithenece)
- 314e4b2  update comments (jithenece)
- 3f6b77d  remove export map (jithenece)
- cbdbbd7  fix code review comments (jithenece)
- 78fb580  fix meta json (jithenece)
- 527cd36  fix meta file (jithenece)
- d6b506c  fix comments (jithenece)
- f7bac4a  fix slice thickness (jithenece)
- 51eef5a  Merge branch 'MHubAI:main' into bamf_nnunet_ct_lung_v2 (jithenece)
- 6d2f654  use dcm2niix engine (jithenece)
- 4a71c20  update model name (jithenece)
- cf7536d  Merge branch 'MHubAI:main' into bamf_nnunet_ct_lung_v2 (jithenece)
- 93b5d85  Merge branch 'MHubAI:main' into bamf_nnunet_ct_lung_v2 (jithenece)
- f813806  /test update test files (jithenece)
- 3c6f6ed  /test publication update (jithenece)
Configuration (new file, 33 lines):

```yaml
general:
  data_base_dir: /app/data
  version: 1.0
  description: default configuration for 3D semantic image segmentation of the lung and lung nodules from ct scan (dicom to dicom)

execute:
- DicomImporter
- NiftiConverter
- NNUnetRunnerV2
- BamfProcessorRunner
- DsegConverter
- DataOrganizer

modules:
  DicomImporter:
    source_dir: input_data
    import_dir: sorted_data
    sort_data: true
    meta:
      mod: '%Modality'

  NNUnetRunnerV2:
    in_data: nifti:mod=ct

  DsegConverter:
    model_name: bamf_ct_lung_nodule
    target_dicom: dicom:mod=ct
    source_segs: nifti:mod=seg:processor=bamf
    skip_empty_slices: True

  DataOrganizer:
    targets:
    - dicomseg-->[i:sid]/bamf_nnunet_ct_lungnodules.seg.dcm
```
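The execute list runs each module in sequence, with every step consuming the outputs of the previous one. A minimal sketch of that sequential-pipeline pattern (illustrative only, not MHub's actual implementation; the step names are stand-ins):

```python
def run_pipeline(data, modules):
    # Each step transforms the data and hands it to the next,
    # mirroring the execute: chain in the config above.
    for step in modules:
        data = step(data)
    return data

# Two stand-in steps, analogous to DicomImporter -> NiftiConverter.
steps = [lambda d: d + ["sorted_dicom"], lambda d: d + ["nifti"]]
result = run_pipeline([], steps)  # ["sorted_dicom", "nifti"]
```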
Dockerfile (new file, 31 lines):

```dockerfile
FROM mhubai/base:latest

# FIXME: set this environment variable as a shortcut to avoid nnunet crashing the build
# by pulling sklearn instead of scikit-learn
# N.B. this is a known issue:
# https://github.com/MIC-DKFZ/nnUNet/issues/1281
# https://github.com/MIC-DKFZ/nnUNet/pull/1209
ENV SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True

# Install nnunet version 2
RUN pip3 install --no-cache-dir nnunetv2==2.0

# Clone the main branch of MHubAI/models
ARG MHUB_MODELS_REPO
RUN buildutils/import_mhub_model.sh bamf_nnunet_ct_lungnodules ${MHUB_MODELS_REPO}

# Pull weights into the container
ENV WEIGHTS_DIR=/root/.nnunet/nnUNet_models/nnUNet/
RUN mkdir -p $WEIGHTS_DIR
ENV WEIGHTS_FN=Dataset007_Nodules.zip
ENV WEIGHTS_URL=https://zenodo.org/record/11582738/files/$WEIGHTS_FN
RUN wget --directory-prefix ${WEIGHTS_DIR} ${WEIGHTS_URL}
RUN unzip ${WEIGHTS_DIR}${WEIGHTS_FN} -d ${WEIGHTS_DIR}
RUN rm ${WEIGHTS_DIR}${WEIGHTS_FN}

# specify nnunet specific environment variables
ENV WEIGHTS_FOLDER=$WEIGHTS_DIR

# Default run script
ENTRYPOINT ["mhub.run"]
CMD ["--config", "/app/models/bamf_nnunet_ct_lungnodules/config/default.yml"]
```
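The weight-pulling block reduces to building a Zenodo URL and an archive path from two variables, then downloading, unzipping, and deleting the archive. A small Python restatement of those paths (the literal values are copied from the Dockerfile; the helper itself is hypothetical, not part of the PR):

```python
import os

def weights_paths(weights_dir="/root/.nnunet/nnUNet_models/nnUNet/",
                  fn="Dataset007_Nodules.zip"):
    # Mirrors ENV WEIGHTS_URL and the wget/unzip targets in the Dockerfile.
    url = f"https://zenodo.org/record/11582738/files/{fn}"
    archive = os.path.join(weights_dir, fn)
    return url, archive
```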
Model metadata (new file, 149 lines):

```json
{
  "id": "",
  "name": "bamf_nnunet_ct_lungnodules",
  "title": "AIMI CT Lung and Nodules",
  "summary": {
    "description": "An nnU-Net based model to segment Lung and Nodules (3mm-30mm) from CT scans",
    "inputs": [
      {
        "label": "Input Image",
        "description": "The CT scan of a patient.",
        "format": "DICOM",
        "modality": "CT",
        "bodypartexamined": "LUNG",
        "slicethickness": "10mm",
        "non-contrast": true,
        "contrast": false
      }
    ],
    "outputs": [
      {
        "label": "Segmentation",
        "type": "Segmentation",
        "description": "Lung and Nodules (3mm-30mm) from CT scans",
        "classes": [
          "LUNG",
          "LUNG+NODULE"
        ]
      }
    ],
    "model": {
      "architecture": "U-net",
      "training": "supervised",
      "cmpapproach": "3D"
    },
    "data": {
      "training": {
        "vol_samples": 1299
      },
      "evaluation": {
        "vol_samples": 114
      },
      "public": true,
      "external": true
    }
  },
  "details": {
    "name": "AIMI CT Lung and Nodule",
    "version": "2.0.0",
    "devteam": "BAMF Health",
    "authors": [
      "Soni, Rahul",
      "McCrumb, Diana",
      "Murugesan, Gowtham Krishnan",
      "Van Oss, Jeff"
    ],
    "type": "nnU-Net (U-Net structure, optimized by data-driven heuristics)",
    "date": {
      "code": "17.10.2023",
      "weights": "28.08.2023",
      "pub": "23.10.2023"
    },
    "cite": "Murugesan, Gowtham Krishnan, Diana McCrumb, Mariam Aboian, Tej Verma, Rahul Soni, Fatima Memon, and Jeff Van Oss. The AIMI Initiative: AI-Generated Annotations for Imaging Data Commons Collections. arXiv preprint arXiv:2310.14897 (2023).",
    "license": {
      "code": "MIT",
      "weights": "CC BY-NC 4.0"
    },
    "publications": [
      {
        "title": "The AIMI Initiative: AI-Generated Annotations in IDC Collections",
        "uri": "https://arxiv.org/abs/2310.14897"
      }
    ],
    "github": "https://github.com/bamf-health/aimi-lung2-ct"
  },
  "info": {
    "use": {
      "title": "Intended Use",
      "text": "This model is designed for analyzing thoracic CT scans to segment lung structures and nodules. It requires input images from CT scans, which are processed using deep learning methods such as U-Net. The model identifies and delineates lung regions and nodules, assisting in lung cancer screening and diagnostics."
    },
    "analyses": {
      "title": "Quantitative Analyses",
      "text": "The model's performance was assessed using the Dice coefficient, 95% Hausdorff distance, and normalized surface distance (NSD). Source radiological images from publicly available NCI IDC collections were filtered to match the modality and region requirements. To ensure the quality of AI-generated annotations, 10% of these annotations were evaluated by radiologists."
    },
    "evaluation": {
      "title": "Evaluation Data",
      "text": "Quantitative metrics compare AI annotations against radiologist annotations. The model was used to segment 1157 cases from the QIN LUNG CT [1], SPIE-AAPM Lung CT Challenge [2], and NLST [3] collections. 114 of those cases were randomly selected to be reviewed and corrected by a board-certified radiologist.",
      "tables": [
        {
          "label": "Dice Score",
          "entries": {
            "Lung": "1.0±0.0",
            "Nodules": "0.78±0.28"
          }
        },
        {
          "label": "95% Hausdorff Distance",
          "entries": {
            "Lung": "0.00±0.00",
            "Nodules": "62.07±10.54"
          }
        },
        {
          "label": "Normalized Surface Distance",
          "entries": {
            "Lung": "0.02±0.11",
            "Nodules": "10.54±14.43"
          }
        }
      ],
      "references": [
        {
          "label": "QIN LUNG CT",
          "uri": "https://www.cancerimagingarchive.net/collection/qin-lung-ct/"
        },
        {
          "label": "SPIE-AAPM Lung CT Challenge",
          "uri": "https://www.cancerimagingarchive.net/collection/spie-aapm-lung-ct-challenge/"
        },
        {
          "label": "NLST",
          "uri": "https://www.cancerimagingarchive.net/collection/nlst/"
        }
      ]
    },
    "training": {
      "title": "Training Data",
      "text": "416 CT cases from NSCLC-Radiomics [2] and 883 CT cases from DICOM-LIDC-IDRI-Nodules [1] were used to train the model. Annotations for the lung regions in the training dataset were generated utilizing Totalsegmentator [3].",
      "references": [
        {
          "label": "DICOM-LIDC-IDRI-Nodules",
          "uri": "https://wiki.cancerimagingarchive.net/pages/viewpage.action?pageId=44499647"
        },
        {
          "label": "NSCLC-Radiomics",
          "uri": "https://www.cancerimagingarchive.net/collection/nsclc-radiomics/"
        },
        {
          "label": "Totalsegmentator",
          "uri": "https://mhub.ai/models/totalsegmentator"
        }
      ]
    },
    "limitations": {
      "title": "Limitations",
      "text": "The model has been trained and tested on scans acquired during clinical care of patients, so it might not be suited for a healthy population. The generalization capabilities of the model on a range of ages, genders, and ethnicities are unknown."
    }
  }
}
```
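The Dice score reported in the evaluation tables measures volumetric overlap between AI and radiologist masks. A minimal NumPy sketch of the metric (an illustration, not the evaluation code used for this PR):

```python
import numpy as np

def dice_coefficient(pred, truth):
    """Dice = 2*|A intersect B| / (|A| + |B|); returns 1.0 for two empty masks."""
    pred = np.asarray(pred, dtype=bool)
    truth = np.asarray(truth, dtype=bool)
    denom = pred.sum() + truth.sum()
    if denom == 0:
        return 1.0
    return 2.0 * np.logical_and(pred, truth).sum() / denom
```

Perfect agreement gives 1.0 (the reported Lung score), while partially overlapping masks score between 0 and 1.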
models/bamf_nnunet_ct_lungnodules/utils/BamfProcessorRunner.py (new file, 152 additions):
```python
"""
-------------------------------------------------
MHub - Run module to perform postprocessing logic on segmentations.
-------------------------------------------------
Author: Jithendra Kumar
Email:  jithendra.kumar@bamfhealth.com
-------------------------------------------------
"""

from mhubio.core import Instance, InstanceData
from mhubio.core import Module, IO
from skimage import measure
import SimpleITK as sitk
import numpy as np


class BamfProcessorRunner(Module):

    def max_planar_dimension(self, label_img, label_cnt):
        """
        Calculate the maximum planar dimension of a specific label in a 3D label image.

        Args:
            label_img (sitk.Image): The 3D label image.
            label_cnt (int): The label number to analyze.

        Returns:
            float: The maximum size of the label in millimeters (mm) across the most planar dimension.
        """
        tumor = label_img == label_cnt

        assert tumor.GetDimension() == 3
        # Pick the axis with the odd spacing out as the through-plane axis.
        spacing = tumor.GetSpacing()
        if spacing[0] == spacing[1] and spacing[1] != spacing[2]:
            axis = 2
            plane_space = spacing[0]
        elif spacing[0] != spacing[1] and spacing[1] == spacing[2]:
            axis = 0
            plane_space = spacing[1]
        else:
            axis = 1
            plane_space = spacing[2]

        lsif = sitk.LabelShapeStatisticsImageFilter()
        lsif.Execute(tumor)

        bounding_box = np.array(lsif.GetBoundingBox(1))
        sizes = bounding_box[3:].tolist()
        del sizes[axis]
        max_planar_size = plane_space * max(sizes)  # mm
        return max_planar_size

    def filter_nodules(self, label_img, min_size=3):
        """
        Filter lung nodules based on their size and re-label them accordingly.

        Args:
            label_img (sitk.Image): The 3D label image containing lung and nodule labels.
            min_size (float): Minimum planar size (in mm) to retain a nodule.

        Returns:
            sitk.Image: The processed label image with nodules filtered by size.
        """
        label_val_lung = 1
        label_val_nodule = 2
        label_val_large_nodule = 3

        nodules_img = label_img == label_val_nodule
        nodule_components = sitk.ConnectedComponent(nodules_img)

        nodules_to_remove = []

        for lbl in range(1, sitk.GetArrayFromImage(nodule_components).max() + 1):
            max_size = self.max_planar_dimension(nodule_components, lbl)

            if max_size < min_size:
                # Drop components below the minimum planar size.
                nodules_to_remove.append(lbl)
            elif 3 <= max_size <= 30:
                # Mark as nodule (label 2).
                label_img = sitk.ChangeLabel(label_img, {lbl: label_val_nodule})
            else:
                # Mark as large nodule (label 3).
                label_img = sitk.ChangeLabel(label_img, {lbl: label_val_large_nodule})

        label_img = sitk.ChangeLabel(label_img, {label_val_nodule: label_val_lung})
        big_nodules = sitk.ChangeLabel(nodule_components, {x: 0 for x in nodules_to_remove})
        label_img = sitk.Mask(label_img, big_nodules > 0, label_val_nodule, label_val_lung)
        label_img = self.n_connected(label_img)

        return label_img

    def n_connected(self, img):
        """
        Retain the largest connected components in a binary label image.

        Args:
            img (sitk.Image): The input binary label image.

        Returns:
            sitk.Image: The processed image with only the largest connected components retained.
        """
        img_data = sitk.GetArrayFromImage(img)
        img_data_mask = np.zeros(img_data.shape)
        img_data_mask[img_data > 0] = 1
        img_filtered = np.zeros(img_data_mask.shape)
        blobs_labels = measure.label(img_data_mask, background=0)
        lbl, counts = np.unique(blobs_labels, return_counts=True)
        lbl_dict = dict(zip(lbl, counts))
        sorted_dict = dict(sorted(lbl_dict.items(), key=lambda x: x[1], reverse=True))
        count = 0

        for key, value in sorted_dict.items():
            # count 0 is the background blob; keep the largest foreground
            # component, and the second-largest only if it exceeds 20% of it.
            if 1 <= count <= 2:
                if count == 1:
                    val = value
                    img_filtered[blobs_labels == key] = 1
                if count == 2 and value > (val * 0.2):
                    img_filtered[blobs_labels == key] = 1
            count += 1

        img_data[img_filtered != 1] = 0
        img_masked = sitk.GetImageFromArray(img_data)
        img_masked.CopyInformation(img)
        return img_masked

    @IO.Instance()
    @IO.Input('in_data', 'nifti:mod=seg:model=nnunet', the='input segmentations')
    @IO.Output('out_data', 'bamf_processed.nii.gz', 'nifti:mod=seg:processor=bamf:roi=LUNG,LUNG+NODULE', data='in_data', the="lung and filtered nodules segmentation")
    def task(self, instance: Instance, in_data: InstanceData, out_data: InstanceData) -> None:
        """
        Main task function that processes the input lung and nodule segmentations,
        filters nodules based on their size, and writes the output image.

        Args:
            instance (Instance): The MHub instance for processing.
            in_data (InstanceData): Input data containing the segmentation.
            out_data (InstanceData): Output data path to save the processed image.
        """
        # Log bamf runner info
        self.log("Running BamfProcessor on....")
        self.log(f"  > input data:  {in_data.abspath}")
        self.log(f"  > output data: {out_data.abspath}")

        label_img = sitk.ReadImage(in_data.abspath)
        filtered_label_img = self.filter_nodules(label_img, min_size=3)
        sitk.WriteImage(filtered_label_img, out_data.abspath)
```
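The size thresholds in filter_nodules and the component-retention rule in n_connected can be stated without SimpleITK. The helpers below are hypothetical restatements of that logic for illustration, not code from the PR:

```python
def classify_nodule(size_mm, min_size=3.0):
    # Mirrors filter_nodules: below min_size -> dropped,
    # 3-30 mm -> nodule (label 2), larger -> large nodule (label 3).
    if size_mm < min_size:
        return "remove"
    if 3 <= size_mm <= 30:
        return "nodule"
    return "large_nodule"

def keep_components(sizes):
    # Mirrors n_connected: keep the largest foreground component, and the
    # second-largest only if it exceeds 20% of the largest.
    order = sorted(sizes, reverse=True)
    kept = order[:1]
    if len(order) > 1 and order[1] > 0.2 * order[0]:
        kept.append(order[1])
    return kept
```

For example, components of size [100, 30, 5] keep [100, 30] (30 > 20% of 100), while [100, 10] keeps only [100].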