Merge pull request #238 from monarch-initiative/duplicate_patient_id

Duplicate patient
monarch-initiative · Sep 4, 2024 · 61e92ad · 61e92ad
2 parents cf42c4e + 7ecd5c3
commit 61e92ad
Show file tree

Hide file tree

Showing 6 changed files with 243 additions and 0 deletions.
diff --git a/src/gpsea/preprocessing/_patient.py b/src/gpsea/preprocessing/_patient.py
@@ -38,13 +38,23 @@ def __init__(self, patient_creator: PatientCreator[T]):
 
     def process(self, inputs: typing.Iterable[T], notepad: Notepad) -> Cohort:
         patients = []
+        patient_labels = set()
+        duplicate_pat_labels = set()
 
         for i, pp in enumerate(inputs):
             sub = notepad.add_subsection(f'patient #{i}')
             patient = self._pc.process(pp, sub)
+            if patient.labels in patient_labels:
+                duplicate_pat_labels.add(patient.labels)
+            patient_labels.add(patient.labels)
             patients.append(patient)
 
         # What happens if a sample has
+        if len(duplicate_pat_labels) > 0:
+            label_summaries = [d.label_summary() for d in duplicate_pat_labels]
+            label_summaries.sort()
+            notepad.add_error(f"Patient ID/s {', '.join(label_summaries)} have a duplicate",
+                              "Please verify every patient has an unique ID.")
 
         # We should have >1 patients in the cohort, right?
         if len(patients) <= 1:

diff --git a/tests/preprocessing/data/dup_id_test_data/pp1.json b/tests/preprocessing/data/dup_id_test_data/pp1.json
@@ -0,0 +1,54 @@
+{
+    "id": "PMID_12345",
+    "subject": {
+      "id": "Pat_1"
+    },
+    "phenotypicFeatures": [
+      {
+        "type": {
+          "id": "HP:5200338",
+          "label": "Excessive fire setting"
+        }
+      }
+    ],
+    "interpretations": [
+      {
+        "id": "Pat_1",
+        "progressStatus": "SOLVED",
+        "diagnosis": {
+          "genomicInterpretations": [
+            {
+              "subjectOrBiosampleId": "Pat_1",
+              "interpretationStatus": "CAUSATIVE",
+              "variantInterpretation": {
+                "variationDescriptor": {
+                  "geneContext": {
+                    "valueId": "HGNC:6138",
+                    "symbol": "ITGA2B"
+                  },
+                  "expressions": [
+                    {
+                      "syntax": "hgvs.c",
+                      "value": "NM_000419.3:c.3077G>A"
+                    },
+                    {
+                      "syntax": "hgvs.g",
+                      "value": "NC_000017.11:g.44372407C>T"
+                    }
+                  ],
+                  "vcfRecord": {
+                    "genomeAssembly": "hg38",
+                    "chrom": "chr17",
+                    "pos": "44372407",
+                    "ref": "C",
+                    "alt": "T"
+                  },
+                  "moleculeContext": "genomic"
+                }
+              }
+            }
+          ]
+        }
+      }
+    ]
+}
diff --git a/tests/preprocessing/data/dup_id_test_data/pp2.json b/tests/preprocessing/data/dup_id_test_data/pp2.json
@@ -0,0 +1,54 @@
+{
+    "id": "PMID_12345",
+    "subject": {
+      "id": "Pat_1"
+    },
+    "phenotypicFeatures": [
+      {
+        "type": {
+          "id": "HP:5200338",
+          "label": "Excessive fire setting"
+        }
+      }
+    ],
+    "interpretations": [
+      {
+        "id": "Pat_1",
+        "progressStatus": "SOLVED",
+        "diagnosis": {
+          "genomicInterpretations": [
+            {
+              "subjectOrBiosampleId": "Pat_1",
+              "interpretationStatus": "CAUSATIVE",
+              "variantInterpretation": {
+                "variationDescriptor": {
+                  "geneContext": {
+                    "valueId": "HGNC:6138",
+                    "symbol": "ITGA2B"
+                  },
+                  "expressions": [
+                    {
+                      "syntax": "hgvs.c",
+                      "value": "NM_000419.3:c.3077G>A"
+                    },
+                    {
+                      "syntax": "hgvs.g",
+                      "value": "NC_000017.11:g.44372407C>T"
+                    }
+                  ],
+                  "vcfRecord": {
+                    "genomeAssembly": "hg38",
+                    "chrom": "chr17",
+                    "pos": "44372407",
+                    "ref": "C",
+                    "alt": "T"
+                  },
+                  "moleculeContext": "genomic"
+                }
+              }
+            }
+          ]
+        }
+      }
+    ]
+}
diff --git a/tests/preprocessing/data/dup_id_test_data/pp3.json b/tests/preprocessing/data/dup_id_test_data/pp3.json
@@ -0,0 +1,54 @@
+{
+    "id": "PMID_67890",
+    "subject": {
+      "id": "Pat_2"
+    },
+    "phenotypicFeatures": [
+      {
+        "type": {
+          "id": "HP:5200338",
+          "label": "Excessive fire setting"
+        }
+      }
+    ],
+    "interpretations": [
+      {
+        "id": "Pat_2",
+        "progressStatus": "SOLVED",
+        "diagnosis": {
+          "genomicInterpretations": [
+            {
+              "subjectOrBiosampleId": "Pat_2",
+              "interpretationStatus": "CAUSATIVE",
+              "variantInterpretation": {
+                "variationDescriptor": {
+                  "geneContext": {
+                    "valueId": "HGNC:6138",
+                    "symbol": "ITGA2B"
+                  },
+                  "expressions": [
+                    {
+                      "syntax": "hgvs.c",
+                      "value": "NM_000419.3:c.3077G>A"
+                    },
+                    {
+                      "syntax": "hgvs.g",
+                      "value": "NC_000017.11:g.44372407C>T"
+                    }
+                  ],
+                  "vcfRecord": {
+                    "genomeAssembly": "hg38",
+                    "chrom": "chr17",
+                    "pos": "44372407",
+                    "ref": "C",
+                    "alt": "T"
+                  },
+                  "moleculeContext": "genomic"
+                }
+              }
+            }
+          ]
+        }
+      }
+    ]
+}
diff --git a/tests/preprocessing/data/dup_id_test_data/pp4.json b/tests/preprocessing/data/dup_id_test_data/pp4.json
@@ -0,0 +1,54 @@
+{
+    "id": "PMID_67890",
+    "subject": {
+      "id": "Pat_2"
+    },
+    "phenotypicFeatures": [
+      {
+        "type": {
+          "id": "HP:5200338",
+          "label": "Excessive fire setting"
+        }
+      }
+    ],
+    "interpretations": [
+      {
+        "id": "Pat_2",
+        "progressStatus": "SOLVED",
+        "diagnosis": {
+          "genomicInterpretations": [
+            {
+              "subjectOrBiosampleId": "Pat_2",
+              "interpretationStatus": "CAUSATIVE",
+              "variantInterpretation": {
+                "variationDescriptor": {
+                  "geneContext": {
+                    "valueId": "HGNC:6138",
+                    "symbol": "ITGA2B"
+                  },
+                  "expressions": [
+                    {
+                      "syntax": "hgvs.c",
+                      "value": "NM_000419.3:c.3077G>A"
+                    },
+                    {
+                      "syntax": "hgvs.g",
+                      "value": "NC_000017.11:g.44372407C>T"
+                    }
+                  ],
+                  "vcfRecord": {
+                    "genomeAssembly": "hg38",
+                    "chrom": "chr17",
+                    "pos": "44372407",
+                    "ref": "C",
+                    "alt": "T"
+                  },
+                  "moleculeContext": "genomic"
+                }
+              }
+            }
+          ]
+        }
+      }
+    ]
+}
diff --git a/tests/preprocessing/test_patient_and_cohort_creator.py b/tests/preprocessing/test_patient_and_cohort_creator.py
@@ -1,4 +1,5 @@
 import os
+import io
 
 import hpotk
 import pytest
@@ -92,3 +93,19 @@ def test_load_phenopacket(
             cohort_creator=phenopacket_cohort_creator,
         )
         print(cohort)
+
+    def test_cohort_creator(
+        self,
+        fpath_test_dir: str,
+        phenopacket_cohort_creator: CohortCreator,
+    ):
+        """Duplicate subject IDs across phenopackets must be reported in the summary."""
+        folder = os.path.join(fpath_test_dir, 'preprocessing', 'data', 'dup_id_test_data')
+        _, results = load_phenopacket_folder(folder, phenopacket_cohort_creator)
+        outfile = io.StringIO()
+        results.summarize(outfile)
+        # `io.StringIO` does not translate newlines; `splitlines` accepts both "\n"
+        # and "\r\n", so the check does not depend on the platform's `os.linesep`.
+        actual_lines = outfile.getvalue().splitlines()
+        expected = " Patient ID/s Pat_1[PMID_12345], Pat_2[PMID_67890] have a duplicate. Please verify every patient has an unique ID."
+        assert expected in actual_lines