Skip to content

Commit

Permalink
Merge pull request #238 from monarch-initiative/duplicate_patient_id
Browse files Browse the repository at this point in the history
Duplicate patient
  • Loading branch information
ielis authored Sep 4, 2024
2 parents cf42c4e + 7ecd5c3 commit 61e92ad
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 0 deletions.
10 changes: 10 additions & 0 deletions src/gpsea/preprocessing/_patient.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,23 @@ def __init__(self, patient_creator: PatientCreator[T]):

def process(self, inputs: typing.Iterable[T], notepad: Notepad) -> Cohort:
patients = []
patient_labels = set()
duplicate_pat_labels = set()

for i, pp in enumerate(inputs):
sub = notepad.add_subsection(f'patient #{i}')
patient = self._pc.process(pp, sub)
if patient.labels in patient_labels:
duplicate_pat_labels.add(patient.labels)
patient_labels.add(patient.labels)
patients.append(patient)

# Report every patient label that was seen more than once in the inputs.
if len(duplicate_pat_labels) > 0:
label_summaries = [d.label_summary() for d in duplicate_pat_labels]
label_summaries.sort()
notepad.add_error(f"Patient ID/s {', '.join(label_summaries)} have a duplicate",
"Please verify every patient has an unique ID.")

# We should have >1 patients in the cohort, right?
if len(patients) <= 1:
Expand Down
54 changes: 54 additions & 0 deletions tests/preprocessing/data/dup_id_test_data/pp1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"id": "PMID_12345",
"subject": {
"id": "Pat_1"
},
"phenotypicFeatures": [
{
"type": {
"id": "HP:5200338",
"label": "Excessive fire setting"
}
}
],
"interpretations": [
{
"id": "Pat_1",
"progressStatus": "SOLVED",
"diagnosis": {
"genomicInterpretations": [
{
"subjectOrBiosampleId": "Pat_1",
"interpretationStatus": "CAUSATIVE",
"variantInterpretation": {
"variationDescriptor": {
"geneContext": {
"valueId": "HGNC:6138",
"symbol": "ITGA2B"
},
"expressions": [
{
"syntax": "hgvs.c",
"value": "NM_000419.3:c.3077G>A"
},
{
"syntax": "hgvs.g",
"value": "NC_000017.11:g.44372407C>T"
}
],
"vcfRecord": {
"genomeAssembly": "hg38",
"chrom": "chr17",
"pos": "44372407",
"ref": "C",
"alt": "T"
},
"moleculeContext": "genomic"
}
}
}
]
}
}
]
}
54 changes: 54 additions & 0 deletions tests/preprocessing/data/dup_id_test_data/pp2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"id": "PMID_12345",
"subject": {
"id": "Pat_1"
},
"phenotypicFeatures": [
{
"type": {
"id": "HP:5200338",
"label": "Excessive fire setting"
}
}
],
"interpretations": [
{
"id": "Pat_1",
"progressStatus": "SOLVED",
"diagnosis": {
"genomicInterpretations": [
{
"subjectOrBiosampleId": "Pat_1",
"interpretationStatus": "CAUSATIVE",
"variantInterpretation": {
"variationDescriptor": {
"geneContext": {
"valueId": "HGNC:6138",
"symbol": "ITGA2B"
},
"expressions": [
{
"syntax": "hgvs.c",
"value": "NM_000419.3:c.3077G>A"
},
{
"syntax": "hgvs.g",
"value": "NC_000017.11:g.44372407C>T"
}
],
"vcfRecord": {
"genomeAssembly": "hg38",
"chrom": "chr17",
"pos": "44372407",
"ref": "C",
"alt": "T"
},
"moleculeContext": "genomic"
}
}
}
]
}
}
]
}
54 changes: 54 additions & 0 deletions tests/preprocessing/data/dup_id_test_data/pp3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"id": "PMID_67890",
"subject": {
"id": "Pat_2"
},
"phenotypicFeatures": [
{
"type": {
"id": "HP:5200338",
"label": "Excessive fire setting"
}
}
],
"interpretations": [
{
"id": "Pat_2",
"progressStatus": "SOLVED",
"diagnosis": {
"genomicInterpretations": [
{
"subjectOrBiosampleId": "Pat_2",
"interpretationStatus": "CAUSATIVE",
"variantInterpretation": {
"variationDescriptor": {
"geneContext": {
"valueId": "HGNC:6138",
"symbol": "ITGA2B"
},
"expressions": [
{
"syntax": "hgvs.c",
"value": "NM_000419.3:c.3077G>A"
},
{
"syntax": "hgvs.g",
"value": "NC_000017.11:g.44372407C>T"
}
],
"vcfRecord": {
"genomeAssembly": "hg38",
"chrom": "chr17",
"pos": "44372407",
"ref": "C",
"alt": "T"
},
"moleculeContext": "genomic"
}
}
}
]
}
}
]
}
54 changes: 54 additions & 0 deletions tests/preprocessing/data/dup_id_test_data/pp4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"id": "PMID_67890",
"subject": {
"id": "Pat_2"
},
"phenotypicFeatures": [
{
"type": {
"id": "HP:5200338",
"label": "Excessive fire setting"
}
}
],
"interpretations": [
{
"id": "Pat_2",
"progressStatus": "SOLVED",
"diagnosis": {
"genomicInterpretations": [
{
"subjectOrBiosampleId": "Pat_2",
"interpretationStatus": "CAUSATIVE",
"variantInterpretation": {
"variationDescriptor": {
"geneContext": {
"valueId": "HGNC:6138",
"symbol": "ITGA2B"
},
"expressions": [
{
"syntax": "hgvs.c",
"value": "NM_000419.3:c.3077G>A"
},
{
"syntax": "hgvs.g",
"value": "NC_000017.11:g.44372407C>T"
}
],
"vcfRecord": {
"genomeAssembly": "hg38",
"chrom": "chr17",
"pos": "44372407",
"ref": "C",
"alt": "T"
},
"moleculeContext": "genomic"
}
}
}
]
}
}
]
}
17 changes: 17 additions & 0 deletions tests/preprocessing/test_patient_and_cohort_creator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import io

import hpotk
import pytest
Expand Down Expand Up @@ -92,3 +93,19 @@ def test_load_phenopacket(
cohort_creator=phenopacket_cohort_creator,
)
print(cohort)

def test_cohort_creator(
    self,
    fpath_test_dir: str,
    phenopacket_cohort_creator: CohortCreator,
):
    """Check that duplicate patient IDs are reported in the preprocessing summary.

    The `dup_id_test_data` folder holds four phenopackets: two for `Pat_1`
    and two for `Pat_2`. Loading the folder should therefore produce a single
    error line that lists both duplicated patient labels.

    :param fpath_test_dir: path to the top-level test directory (pytest fixture).
    :param phenopacket_cohort_creator: the cohort creator under test (pytest fixture).
    """
    folder = os.path.join(fpath_test_dir, 'preprocessing', 'data', 'dup_id_test_data')
    _, results = load_phenopacket_folder(folder, phenopacket_cohort_creator)

    # Capture the human-readable summary in memory instead of printing it.
    outfile = io.StringIO()
    results.summarize(outfile)

    # `io.StringIO` does not translate newlines, so splitting on `os.linesep`
    # only works when the writer emits exactly the platform separator.
    # `splitlines()` accepts both "\n" and "\r\n", which keeps the test
    # portable regardless of how the summary terminates its lines.
    actual_lines = outfile.getvalue().splitlines()

    expected = " Patient ID/s Pat_1[PMID_12345], Pat_2[PMID_67890] have a duplicate. Please verify every patient has an unique ID."
    assert expected in actual_lines

0 comments on commit 61e92ad

Please sign in to comment.