From 43edff3467896d4d1339173c4a62f72a3ff74d10 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 14 Jan 2025 15:53:23 +0100 Subject: [PATCH 1/5] Add an example translocation into a test phenopacket. --- .../PMID_30968594_individual_1.json | 23 +++++++++++++++++++ tests/test_data/phenopackets/README.md | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_data/phenopackets/PMID_30968594_individual_1.json b/tests/test_data/phenopackets/PMID_30968594_individual_1.json index 712366691..32650f033 100644 --- a/tests/test_data/phenopackets/PMID_30968594_individual_1.json +++ b/tests/test_data/phenopackets/PMID_30968594_individual_1.json @@ -266,6 +266,29 @@ } } } + }, + { + "subjectOrBiosampleId": "individual 1", + "interpretationStatus": "CAUSATIVE", + "variantInterpretation": { + "variationDescriptor": { + "id": "var_TerBgCxXbYQtjsIsIjzgpliJm", + "label": "translocation: t(9;15)(q34.1;q13) breakpoint in EHMT1", + "geneContext": { + "valueId": "HGNC:24650", + "symbol": "EHMT1" + }, + "moleculeContext": "genomic", + "structuralType": { + "id": "SO:1000044", + "label": "chromosomal_translocation" + }, + "allelicState": { + "id": "GENO:0000135", + "label": "heterozygous" + } + } + } } ] } diff --git a/tests/test_data/phenopackets/README.md b/tests/test_data/phenopackets/README.md index e4a58e595..5f0788ea7 100644 --- a/tests/test_data/phenopackets/README.md +++ b/tests/test_data/phenopackets/README.md @@ -5,4 +5,4 @@ The phenopackets used for testing. ## `PMID_30968594_individual_1.json` A phenopacket with individual characteristics, phenotype features, measurements, -interpretations, and diseases. +interpretations (a SNP, an imprecise SV, and a translocation), and diseases. From 3b0cb6710d67c0cbf907b44930642842066f1c40 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 14 Jan 2025 15:54:07 +0100 Subject: [PATCH 2/5] The fixtures' scope should be class. 
--- tests/preprocessing/test_phenopacket.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/preprocessing/test_phenopacket.py b/tests/preprocessing/test_phenopacket.py index 3e066efce..15afcc88e 100644 --- a/tests/preprocessing/test_phenopacket.py +++ b/tests/preprocessing/test_phenopacket.py @@ -138,7 +138,7 @@ def read_genomic_interpretation_json(fpath: str) -> GenomicInterpretation: class TestPhenopacketPatientCreator: - @pytest.fixture + @pytest.fixture(scope="class") def functional_annotator( self, fpath_project_dir: str, @@ -152,7 +152,7 @@ def functional_annotator( cache_dir=fpath_variant_cache_dir, ) - @pytest.fixture + @pytest.fixture(scope="class") def imprecise_sv_functional_annotator( self, genome_build: GenomeBuild, @@ -163,7 +163,7 @@ def imprecise_sv_functional_annotator( ), ) - @pytest.fixture + @pytest.fixture(scope="class") def variant_coordinate_finder( self, genome_build: GenomeBuild, @@ -176,7 +176,7 @@ def variant_coordinate_finder( def onset_term_parser(self) -> PhenopacketOntologyTermOnsetParser: return PhenopacketOntologyTermOnsetParser.default_parser() - @pytest.fixture + @pytest.fixture(scope="class") def patient_creator( self, hpo: hpotk.MinimalOntology, @@ -197,7 +197,7 @@ def patient_creator( term_onset_parser=onset_term_parser, ) - @pytest.fixture + @pytest.fixture(scope="class") def phenopacket( self, fpath_phenopacket_dir: str, From b18bb507ecbf33430d02c0635547fdf6bcbaae46 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 14 Jan 2025 16:14:40 +0100 Subject: [PATCH 3/5] Predict that an imprecise translocation leads to a transcript translocation effect. 
--- src/gpsea/model/_variant_effects.py | 2 ++ src/gpsea/preprocessing/_generic.py | 2 ++ tests/preprocessing/test_phenopacket.py | 34 +++++++++++++++++++++++++ 3 files changed, 38 insertions(+) diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index 9567592d4..bbbb33e6b 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -25,6 +25,7 @@ class VariantEffect(enum.Enum): """ TRANSCRIPT_ABLATION = "SO:0001893" + TRANSCRIPT_TRANSLOCATION = "SO:0001883" SPLICE_ACCEPTOR_VARIANT = "SO:0001574" SPLICE_DONOR_VARIANT = "SO:0001575" STOP_GAINED = "SO:0001587" @@ -119,6 +120,7 @@ def __str__(self) -> str: effect_to_display = { VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation", + VariantEffect.TRANSCRIPT_TRANSLOCATION: "transcript translocation", VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor", VariantEffect.SPLICE_DONOR_VARIANT: "splice donor", VariantEffect.STOP_GAINED: "stop gained", diff --git a/src/gpsea/preprocessing/_generic.py b/src/gpsea/preprocessing/_generic.py index 2cfe3d376..0c52e04a2 100644 --- a/src/gpsea/preprocessing/_generic.py +++ b/src/gpsea/preprocessing/_generic.py @@ -47,6 +47,8 @@ def _map_to_variant_effects( return (VariantEffect.TRANSCRIPT_ABLATION,) elif variant_class == VariantClass.DUP: return (VariantEffect.TRANSCRIPT_AMPLIFICATION,) + elif variant_class == VariantClass.TRANSLOCATION: + return (VariantEffect.TRANSCRIPT_TRANSLOCATION,) else: # This mapping is most likely incomplete. 
# Please open a ticket if support diff --git a/tests/preprocessing/test_phenopacket.py b/tests/preprocessing/test_phenopacket.py index 15afcc88e..5503bd065 100644 --- a/tests/preprocessing/test_phenopacket.py +++ b/tests/preprocessing/test_phenopacket.py @@ -11,6 +11,7 @@ from phenopackets.schema.v2.core.interpretation_pb2 import GenomicInterpretation from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket +from gpsea.model import VariantClass from gpsea.model.genome import GenomeBuild, Strand from gpsea.preprocessing import VVHgvsVariantCoordinateFinder @@ -299,6 +300,39 @@ def test_phenopacket_patient_creator( assert disease.onset.is_postnatal is True assert disease.onset.days == pytest.approx(20.) + # variants + assert len(patient.variants) == 3 + snp = patient.variants[0] + assert snp.variant_info.has_variant_coordinates() + snp_vc = snp.variant_info.variant_coordinates + assert snp_vc is not None + assert snp_vc.chrom == "6" + assert snp_vc.start == 32_040_420 + assert snp_vc.end == 32_040_421 + assert snp_vc.ref == "C" + assert snp_vc.alt == "T" + assert snp_vc.change_length == 0 + assert snp_vc.variant_class == VariantClass.SNV + + imprecise_sv = patient.variants[1] + assert imprecise_sv.variant_info.has_sv_info() + sv_vi = imprecise_sv.variant_info.sv_info + assert sv_vi is not None + assert sv_vi.gene_id == "HGNC:2600" + assert sv_vi.gene_symbol == "CYP21A2" + assert sv_vi.structural_type.value == "SO:1000029" # `chromosomal_deletion` + assert sv_vi.variant_class == VariantClass.DEL + + imprecise_tra = patient.variants[2] + assert imprecise_tra.variant_info.has_sv_info() + tra_vi = imprecise_tra.variant_info.sv_info + assert tra_vi is not None + assert tra_vi.gene_id == "HGNC:24650" + assert tra_vi.gene_symbol == "EHMT1" + assert tra_vi.structural_type.value == "SO:1000044" # `chromosomal_translocation` + assert tra_vi.variant_class == VariantClass.TRANSLOCATION + + def test_individual_with_no_genotype( self, phenopacket: Phenopacket, From 
d571a57f8b2bfe7bd59f93e094b0edd3716c80f3 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 14 Jan 2025 16:33:54 +0100 Subject: [PATCH 4/5] Reorder `VariantEffect` enum members. --- src/gpsea/model/_variant_effects.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpsea/model/_variant_effects.py b/src/gpsea/model/_variant_effects.py index bbbb33e6b..082e54ae3 100644 --- a/src/gpsea/model/_variant_effects.py +++ b/src/gpsea/model/_variant_effects.py @@ -24,6 +24,7 @@ class VariantEffect(enum.Enum): 'SO:0001583' """ + TRANSCRIPT_AMPLIFICATION = "SO:0001889" TRANSCRIPT_ABLATION = "SO:0001893" TRANSCRIPT_TRANSLOCATION = "SO:0001883" SPLICE_ACCEPTOR_VARIANT = "SO:0001574" @@ -32,7 +33,6 @@ class VariantEffect(enum.Enum): FRAMESHIFT_VARIANT = "SO:0001589" STOP_LOST = "SO:0001578" START_LOST = "SO:0002012" - TRANSCRIPT_AMPLIFICATION = "SO:0001889" INFRAME_INSERTION = "SO:0001821" INFRAME_DELETION = "SO:0001822" MISSENSE_VARIANT = "SO:0001583" @@ -119,6 +119,7 @@ def __str__(self) -> str: effect_to_display = { + VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification", VariantEffect.TRANSCRIPT_ABLATION: "transcript ablation", VariantEffect.TRANSCRIPT_TRANSLOCATION: "transcript translocation", VariantEffect.SPLICE_ACCEPTOR_VARIANT: "splice acceptor", @@ -127,7 +128,6 @@ def __str__(self) -> str: VariantEffect.FRAMESHIFT_VARIANT: "frameshift", VariantEffect.STOP_LOST: "stop lost", VariantEffect.START_LOST: "start lost", - VariantEffect.TRANSCRIPT_AMPLIFICATION: "transcript amplification", VariantEffect.INFRAME_INSERTION: "inframe insertion", VariantEffect.INFRAME_DELETION: "inframe deletion", VariantEffect.MISSENSE_VARIANT: "missense", From 5b6c81b9e2b834ea540b3c7a760a8ebab440b816 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Tue, 14 Jan 2025 16:36:11 +0100 Subject: [PATCH 5/5] Show the transcript translocation in the docs. 
--- docs/tutorial.rst | 4 ++++ .../genotype/variant_predicates.rst | 23 +++++++++++-------- docs/user-guide/analyses/phenotype-scores.rst | 6 +++-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index cf6dc0551..4db660626 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -273,10 +273,14 @@ depending on presence of a single allele of a missense or truncating variant >>> from gpsea.analysis.clf import monoallelic_classifier >>> is_missense = variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) >>> truncating_effects = ( +... VariantEffect.TRANSCRIPT_ABLATION, +... VariantEffect.TRANSCRIPT_TRANSLOCATION, ... VariantEffect.FRAMESHIFT_VARIANT, +... VariantEffect.START_LOST, ... VariantEffect.STOP_GAINED, ... VariantEffect.SPLICE_DONOR_VARIANT, ... VariantEffect.SPLICE_ACCEPTOR_VARIANT, +... # more effects could be listed here ... ... ) >>> is_truncating = anyof(variant_effect(e, tx_id) for e in truncating_effects) >>> gt_clf = monoallelic_classifier( diff --git a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst index 588d3da27..4fbca7dfe 100644 --- a/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst +++ b/docs/user-guide/analyses/partitioning/genotype/variant_predicates.rst @@ -26,13 +26,11 @@ The predicates operate on several lines of information: +------------------------+-------------------------------------------------------------------------------------------------+ | Protein data | variant is located in a region encoding a protein domain, protein feature type | +------------------------+-------------------------------------------------------------------------------------------------+ -| Genome | overlap with a genomic region of interest | -+------------------------+-------------------------------------------------------------------------------------------------+ The scope of the builtin 
predicates is fairly narrow and likely insufficient for real-life analyses. -However, the predicates can be chained into a compound predicate +However, several predicates can be "chained" into a compound predicate using a boolean logic, to achive more expressivity for testing complex conditions, such as "variant is a missense or synonymous variant located in exon 6 of `NM_013275.6`". @@ -41,8 +39,9 @@ such as "variant is a missense or synonymous variant located in exon 6 of `NM_01 Examples ******** -Here we show examples of several simple variant predicates and -how to chain them for testing complex conditions. +Here we show how to use the builtin predicates for simple tests +and how to build a compound predicate from the builtin predicates, +for testing complex conditions. Load cohort @@ -112,10 +111,10 @@ See the :mod:`gpsea.analysis.predicate` module for a complete list of the builtin predicates. -Predicate chain -=============== +Compound predicates +=================== -Using the builtin predicates, we can build a logical chain to test complex conditions. +A compound predicate for testing complex conditions can be built from two or more predicates. For instance, we can test if the variant meets any of several conditions: >>> import gpsea.analysis.predicate as vp @@ -130,7 +129,13 @@ or *all* conditions: >>> missense_and_exon20.test(variant) True -All variant predicates overload Python ``&`` (AND) and ``|`` (OR) operators, to allow chaining. +All variant predicates overload Python ``&`` (AND) and ``|`` (OR) operators, +to combine a predicate pair into a compound predicate. + +.. note:: + + Combining three or more predicates can be achieved with :func:`~gpsea.analysis.allof` + and :func:`~gpsea.analysis.anyof` functions. Therefore, there is nothing that prevents us to combine the predicates into multi-level tests, e.g. 
to test if the variant is a *"chromosomal deletion" or a deletion which removes at least 50 bp*: diff --git a/docs/user-guide/analyses/phenotype-scores.rst b/docs/user-guide/analyses/phenotype-scores.rst index 85347ed96..5390e3ff0 100644 --- a/docs/user-guide/analyses/phenotype-scores.rst +++ b/docs/user-guide/analyses/phenotype-scores.rst @@ -121,9 +121,11 @@ In this example, the point mutation is a mutation that meets the following condi '((change length == 0 AND reference allele length == 1) AND MISSENSE_VARIANT on NM_001042681.2)' -For the loss of function predicate, the following variant effects are considered loss of function: +For the loss-of-function predicate, the following is a non-exhaustive list +of variant effects considered as a loss-of-function: >>> lof_effects = ( +... VariantEffect.TRANSCRIPT_TRANSLOCATION, ... VariantEffect.TRANSCRIPT_ABLATION, ... VariantEffect.FRAMESHIFT_VARIANT, ... VariantEffect.START_LOST, @@ -131,7 +133,7 @@ For the loss of function predicate, the following variant effects are considered ... ) >>> lof_mutation = anyof(variant_effect(eff, tx_id) for eff in lof_effects) >>> lof_mutation.description -'(TRANSCRIPT_ABLATION on NM_001042681.2 OR FRAMESHIFT_VARIANT on NM_001042681.2 OR START_LOST on NM_001042681.2 OR STOP_GAINED on NM_001042681.2)' +'(TRANSCRIPT_TRANSLOCATION on NM_001042681.2 OR TRANSCRIPT_ABLATION on NM_001042681.2 OR FRAMESHIFT_VARIANT on NM_001042681.2 OR START_LOST on NM_001042681.2 OR STOP_GAINED on NM_001042681.2)' The genotype predicate will bin the patient into two classes: a point mutation or the loss of function: