diff --git a/src/gpsea/preprocessing/_patient.py b/src/gpsea/preprocessing/_patient.py index 902851e33..7a9ed95cc 100644 --- a/src/gpsea/preprocessing/_patient.py +++ b/src/gpsea/preprocessing/_patient.py @@ -38,13 +38,23 @@ def __init__(self, patient_creator: PatientCreator[T]): def process(self, inputs: typing.Iterable[T], notepad: Notepad) -> Cohort: patients = [] + patient_labels = set() + duplicate_pat_labels = set() for i, pp in enumerate(inputs): sub = notepad.add_subsection(f'patient #{i}') patient = self._pc.process(pp, sub) + if patient.labels in patient_labels: + duplicate_pat_labels.add(patient.labels) + patient_labels.add(patient.labels) patients.append(patient) # What happens if a sample has + if len(duplicate_pat_labels) > 0: + label_summaries = [d.label_summary() for d in duplicate_pat_labels] + label_summaries.sort() + notepad.add_error(f"Patient ID/s {', '.join(label_summaries)} have a duplicate", + "Please verify every patient has an unique ID.") # We should have >1 patients in the cohort, right? if len(patients) <= 1: diff --git a/tests/preprocessing/data/dup_id_test_data/pp1.json b/tests/preprocessing/data/dup_id_test_data/pp1.json new file mode 100644 index 000000000..6f81c2e4e --- /dev/null +++ b/tests/preprocessing/data/dup_id_test_data/pp1.json @@ -0,0 +1,54 @@ +{ + "id": "PMID_12345", + "subject": { + "id": "Pat_1" + }, + "phenotypicFeatures": [ + { + "type": { + "id": "HP:5200338", + "label": "Excessive fire setting" + } + } + ], + "interpretations": [ + { + "id": "Pat_1", + "progressStatus": "SOLVED", + "diagnosis": { + "genomicInterpretations": [ + { + "subjectOrBiosampleId": "Pat_1", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:6138", + "symbol": "ITGA2B" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_000419.3:c.3077G>A" + }, + { + "syntax": "hgvs.g", + "value": "NC_000017.11:g.44372407C>T" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr17", + "pos": "44372407", + "ref": "C", + "alt": "T" + }, + "moleculeContext": "genomic" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/data/dup_id_test_data/pp2.json b/tests/preprocessing/data/dup_id_test_data/pp2.json new file mode 100644 index 000000000..6f81c2e4e --- /dev/null +++ b/tests/preprocessing/data/dup_id_test_data/pp2.json @@ -0,0 +1,54 @@ +{ + "id": "PMID_12345", + "subject": { + "id": "Pat_1" + }, + "phenotypicFeatures": [ + { + "type": { + "id": "HP:5200338", + "label": "Excessive fire setting" + } + } + ], + "interpretations": [ + { + "id": "Pat_1", + "progressStatus": "SOLVED", + "diagnosis": { + "genomicInterpretations": [ + { + "subjectOrBiosampleId": "Pat_1", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:6138", + "symbol": "ITGA2B" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_000419.3:c.3077G>A" + }, + { + "syntax": "hgvs.g", + "value": "NC_000017.11:g.44372407C>T" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr17", + "pos": "44372407", + "ref": "C", + "alt": "T" + }, + "moleculeContext": "genomic" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/data/dup_id_test_data/pp3.json b/tests/preprocessing/data/dup_id_test_data/pp3.json new file mode 100644 index 000000000..5ace5173e --- /dev/null +++ b/tests/preprocessing/data/dup_id_test_data/pp3.json @@ -0,0 +1,54 @@ +{ + "id": "PMID_67890", + "subject": { + "id": "Pat_2" + }, + "phenotypicFeatures": [ + { + "type": { + "id": "HP:5200338", + "label": "Excessive fire setting" + } + } + ], + "interpretations": [ + { + "id": "Pat_2", + "progressStatus": "SOLVED", + "diagnosis": { + "genomicInterpretations": [ + { + "subjectOrBiosampleId": "Pat_2", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:6138", + "symbol": "ITGA2B" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_000419.3:c.3077G>A" + }, + { + "syntax": "hgvs.g", + "value": "NC_000017.11:g.44372407C>T" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr17", + "pos": "44372407", + "ref": "C", + "alt": "T" + }, + "moleculeContext": "genomic" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/data/dup_id_test_data/pp4.json b/tests/preprocessing/data/dup_id_test_data/pp4.json new file mode 100644 index 000000000..5ace5173e --- /dev/null +++ b/tests/preprocessing/data/dup_id_test_data/pp4.json @@ -0,0 +1,54 @@ +{ + "id": "PMID_67890", + "subject": { + "id": "Pat_2" + }, + "phenotypicFeatures": [ + { + "type": { + "id": "HP:5200338", + "label": "Excessive fire setting" + } + } + ], + "interpretations": [ + { + "id": "Pat_2", + "progressStatus": "SOLVED", + "diagnosis": { + "genomicInterpretations": [ + { + "subjectOrBiosampleId": "Pat_2", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "geneContext": { + "valueId": "HGNC:6138", + "symbol": "ITGA2B" + }, + "expressions": [ + { + "syntax": "hgvs.c", + "value": "NM_000419.3:c.3077G>A" + }, + { + "syntax": "hgvs.g", + "value": "NC_000017.11:g.44372407C>T" + } + ], + "vcfRecord": { + "genomeAssembly": "hg38", + "chrom": "chr17", + "pos": "44372407", + "ref": "C", + "alt": "T" + }, + "moleculeContext": "genomic" + } + } + } + ] + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/test_patient_and_cohort_creator.py b/tests/preprocessing/test_patient_and_cohort_creator.py index d4564bfea..5e5e84a26 100644 --- a/tests/preprocessing/test_patient_and_cohort_creator.py +++ b/tests/preprocessing/test_patient_and_cohort_creator.py @@ -1,4 +1,5 @@ import os +import io import hpotk import pytest @@ -92,3 +93,19 @@ def test_load_phenopacket( cohort_creator=phenopacket_cohort_creator, ) print(cohort) + + def test_cohort_creator( + self, + fpath_test_dir: str, + phenopacket_cohort_creator: CohortCreator, + ): + folder = os.path.join(fpath_test_dir, 'preprocessing', 'data', 'dup_id_test_data') + _, results = load_phenopacket_folder(folder, phenopacket_cohort_creator) + + outfile = io.StringIO() + results.summarize(outfile) + + actual_lines = outfile.getvalue().split(os.linesep) + + expected = " Patient ID/s Pat_1[PMID_12345], Pat_2[PMID_67890] have a duplicate. Please verify every patient has an unique ID." + assert expected in actual_lines