diff --git a/docs/img/tutorial/tbx5_protein_diagram.png b/docs/img/tutorial/tbx5_protein_diagram.png index 10422c9ec..12c969ca4 100644 Binary files a/docs/img/tutorial/tbx5_protein_diagram.png and b/docs/img/tutorial/tbx5_protein_diagram.png differ diff --git a/docs/report/tbx5_truncating_vs_missense.csv b/docs/report/tbx5_truncating_vs_missense.csv index 5bcf06e67..4465a91de 100644 --- a/docs/report/tbx5_truncating_vs_missense.csv +++ b/docs/report/tbx5_truncating_vs_missense.csv @@ -1,19 +1,18 @@ -Allele group,Missense,Missense,Truncating,Truncating,, -,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],31/60,52%,29/29,100%,9.550859422122477e-06,5.618152601248516e-07 -Hypoplasia of the radius [HP:0002984],30/62,48%,10/27,37%,1.0,0.3614379675325876 -Atrial septal defect [HP:0001631],42/44,95%,38/38,100%,1.0,0.49653718759409815 -Secundum atrial septal defect [HP:0001684],14/35,40%,13/40,32%,1.0,0.6304400561799244 -Abnormal cardiac septum morphology [HP:0001671],62/62,100%,50/50,100%,1.0,1.0 -Abnormal hand morphology [HP:0005922],53/53,100%,41/41,100%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],43/43,100%,38/38,100%,1.0,1.0 -Abnormal cardiac atrium morphology [HP:0005120],43/43,100%,38/38,100%,1.0,1.0 -Abnormal appendicular skeleton morphology [HP:0011844],64/64,100%,60/60,100%,1.0,1.0 -Aplasia/hypoplasia of the extremities [HP:0009815],55/55,100%,44/44,100%,1.0,1.0 -Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56,100%,45/45,100%,1.0,1.0 -Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55,100%,44/44,100%,1.0,1.0 -Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55,100%,44/44,100%,1.0,1.0 -Abnormal long bone morphology [HP:0011314],44/44,100%,19/19,100%,1.0,1.0 -Abnormal finger morphology [HP:0001167],36/36,100%,56/56,100%,1.0,1.0 -Abnormal digit morphology [HP:0011297],38/38,100%,59/59,100%,1.0,1.0 -Abnormal thumb morphology [HP:0001172],30/30,100%,56/56,100%,1.0,1.0 +,Missense,Truncating,Corrected p values,p values +Ventricular septal defect [HP:0001629],31/60 (52%),29/29 (100%),9.550859422122477e-06,5.618152601248516e-07 +Hypoplasia of the radius [HP:0002984],30/62 (48%),10/27 (37%),1.0,0.3614379675325876 +Atrial septal defect [HP:0001631],42/44 (95%),38/38 (100%),1.0,0.49653718759409815 +Secundum atrial septal defect [HP:0001684],14/35 (40%),13/40 (32%),1.0,0.6304400561799244 +Abnormal thumb morphology [HP:0001172],30/30 (100%),56/56 (100%),1.0,1.0 +Abnormal finger morphology [HP:0001167],36/36 (100%),56/56 (100%),1.0,1.0 +Abnormal digit morphology [HP:0011297],38/38 (100%),59/59 (100%),1.0,1.0 +Abnormal atrial septum morphology [HP:0011994],43/43 (100%),38/38 (100%),1.0,1.0 +Abnormal cardiac atrium morphology [HP:0005120],43/43 (100%),38/38 (100%),1.0,1.0 +Abnormal long bone morphology [HP:0011314],44/44 (100%),19/19 (100%),1.0,1.0 +Abnormal hand morphology [HP:0005922],53/53 (100%),41/41 (100%),1.0,1.0 +Aplasia/hypoplasia of the extremities [HP:0009815],55/55 (100%),44/44 (100%),1.0,1.0 +Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],55/55 (100%),44/44 (100%),1.0,1.0 +Aplasia/hypoplasia involving bones of the extremities [HP:0045060],55/55 (100%),44/44 (100%),1.0,1.0 +Aplasia/hypoplasia involving the skeleton [HP:0009115],56/56 (100%),45/45 (100%),1.0,1.0 +Abnormal cardiac septum morphology [HP:0001671],62/62 (100%),50/50 (100%),1.0,1.0 +Abnormal appendicular skeleton morphology [HP:0011844],64/64 (100%),60/60 (100%),1.0,1.0 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 4db660626..c25f7263c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -166,32 +166,17 @@ the most common HPO terms, variants, diseases, and variant effects: Plot distribution of variants with respect to the protein sequence ------------------------------------------------------------------ -We can also show the distribution of variants with respect to the encoded protein. -We first obtain ``tx_coordinates`` (:class:`~gpsea.model.TranscriptCoordinates`) -with genomic coordinates of the transcript, including e.g. untranslated regions or exons: +We can use :class:`~gpsea.view.CohortArtist` to plot the distribution of variants +with respect to the encoded protein on +a Matplotlib `Axes `_: ->>> from gpsea.preprocessing import configure_default_tx_coordinate_service ->>> tx_service = configure_default_tx_coordinate_service(genome_build="GRCh38.p13") ->>> tx_coordinates = tx_service.fetch(tx_id) - - -and we also get ``protein_meta`` (:class:`~gpsea.model.ProteinMetadata`) -with the domains and regions of the encoded protein: - ->>> from gpsea.preprocessing import configure_default_protein_metadata_service ->>> pms = configure_default_protein_metadata_service() ->>> protein_meta = pms.annotate(px_id) - -Now we can plot a diagram of the mutations on the protein: - ->>> from gpsea.view import ProteinVisualizer >>> import matplotlib.pyplot as plt +>>> from gpsea.view import configure_default_cohort_artist +>>> cohort_artist = configure_default_cohort_artist() >>> fig, ax = plt.subplots(figsize=(15, 8)) ->>> visualizer = ProteinVisualizer() ->>> visualizer.draw_protein_diagram( -... tx_coordinates, -... protein_meta, -... cohort, +>>> cohort_artist.draw_protein( +... cohort=cohort, +... protein_id=px_id, ... ax=ax, ... ) diff --git a/docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png b/docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png index 86e0d3a48..b09d1c5c5 100644 Binary files a/docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png and b/docs/user-guide/analyses/report/rere_phenotype_score_boxplot.png differ diff --git a/docs/user-guide/analyses/report/tbx5_frameshift.csv b/docs/user-guide/analyses/report/tbx5_frameshift.csv index a9c035a62..ae05fc331 100644 --- a/docs/user-guide/analyses/report/tbx5_frameshift.csv +++ b/docs/user-guide/analyses/report/tbx5_frameshift.csv @@ -1,26 +1,25 @@ -Allele group,Frameshift,Frameshift,Other,Other,, -,Count,Percent,Count,Percent,Corrected p values,p values -Ventricular septal defect [HP:0001629],19/19,100%,42/71,59%,0.005806240832840839,0.00024192670136836828 -Absent thumb [HP:0009777],14/31,45%,18/100,18%,0.04456405819223913,0.0037136715160199273 -Secundum atrial septal defect [HP:0001684],4/22,18%,23/55,42%,0.4065940176561687,0.06544319142266644 -Triphalangeal thumb [HP:0001199],13/32,41%,23/99,23%,0.4065940176561687,0.06932119159387057 -Muscular ventricular septal defect [HP:0011623],6/25,24%,8/84,10%,0.4065940176561687,0.08470708701170182 -Short thumb [HP:0009778],8/30,27%,25/69,36%,1.0,0.4870099714553749 -Absent radius [HP:0003974],6/25,24%,9/43,21%,1.0,0.7703831604944444 -Atrial septal defect [HP:0001631],20/20,100%,63/65,97%,1.0,1.0 -Abnormal atrial septum morphology [HP:0011994],20/20,100%,64/64,100%,1.0,1.0 -Abnormal cardiac septum morphology [HP:0001671],28/28,100%,89/89,100%,1.0,1.0 -Abnormal cardiac atrium morphology [HP:0005120],20/20,100%,64/64,100%,1.0,1.0 -Hypoplasia of the radius [HP:0002984],6/14,43%,34/75,45%,1.0,1.0 -Abnormal appendicular skeleton morphology [HP:0011844],34/34,100%,93/93,100%,1.0,1.0 -Aplasia/hypoplasia of the extremities [HP:0009815],22/22,100%,78/78,100%,1.0,1.0 -Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23,100%,80/80,100%,1.0,1.0 -Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22,100%,78/78,100%,1.0,1.0 -Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22,100%,78/78,100%,1.0,1.0 -Abnormal long bone morphology [HP:0011314],13/13,100%,50/50,100%,1.0,1.0 -Abnormal hand morphology [HP:0005922],20/20,100%,75/75,100%,1.0,1.0 -Abnormal thumb morphology [HP:0001172],31/31,100%,58/58,100%,1.0,1.0 -Abnormal finger morphology [HP:0001167],31/31,100%,64/64,100%,1.0,1.0 -Abnormal digit morphology [HP:0011297],33/33,100%,67/67,100%,1.0,1.0 -Aplasia/Hypoplasia of fingers [HP:0006265],19/19,100%,44/44,100%,1.0,1.0 -Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19,100%,44/44,100%,1.0,1.0 +,Frameshift,Other,Corrected p values,p values +Ventricular septal defect [HP:0001629],19/19 (100%),42/71 (59%),0.005806240832840839,0.00024192670136836828 +Absent thumb [HP:0009777],14/31 (45%),18/100 (18%),0.04456405819223913,0.0037136715160199273 +Secundum atrial septal defect [HP:0001684],4/22 (18%),23/55 (42%),0.4065940176561687,0.06544319142266644 +Triphalangeal thumb [HP:0001199],13/32 (41%),23/99 (23%),0.4065940176561687,0.06932119159387057 +Muscular ventricular septal defect [HP:0011623],6/25 (24%),8/84 (10%),0.4065940176561687,0.08470708701170182 +Short thumb [HP:0009778],8/30 (27%),25/69 (36%),1.0,0.4870099714553749 +Absent radius [HP:0003974],6/25 (24%),9/43 (21%),1.0,0.7703831604944444 +Abnormal long bone morphology [HP:0011314],13/13 (100%),50/50 (100%),1.0,1.0 +Aplasia/Hypoplasia of fingers [HP:0006265],19/19 (100%),44/44 (100%),1.0,1.0 +Aplasia/hypoplasia involving bones of the hand [HP:0005927],19/19 (100%),44/44 (100%),1.0,1.0 +Atrial septal defect [HP:0001631],20/20 (100%),63/65 (97%),1.0,1.0 +Abnormal atrial septum morphology [HP:0011994],20/20 (100%),64/64 (100%),1.0,1.0 +Abnormal cardiac atrium morphology [HP:0005120],20/20 (100%),64/64 (100%),1.0,1.0 +Abnormal hand morphology [HP:0005922],20/20 (100%),75/75 (100%),1.0,1.0 +Aplasia/hypoplasia of the extremities [HP:0009815],22/22 (100%),78/78 (100%),1.0,1.0 +Aplasia/hypoplasia involving bones of the upper limbs [HP:0006496],22/22 (100%),78/78 (100%),1.0,1.0 +Aplasia/hypoplasia involving bones of the extremities [HP:0045060],22/22 (100%),78/78 (100%),1.0,1.0 +Aplasia/hypoplasia involving the skeleton [HP:0009115],23/23 (100%),80/80 (100%),1.0,1.0 +Abnormal cardiac septum morphology [HP:0001671],28/28 (100%),89/89 (100%),1.0,1.0 +Abnormal thumb morphology [HP:0001172],31/31 (100%),58/58 (100%),1.0,1.0 +Abnormal finger morphology [HP:0001167],31/31 (100%),64/64 (100%),1.0,1.0 +Abnormal digit morphology [HP:0011297],33/33 (100%),67/67 (100%),1.0,1.0 +Abnormal appendicular skeleton morphology [HP:0011844],34/34 (100%),93/93 (100%),1.0,1.0 +Hypoplasia of the radius [HP:0002984],6/14 (43%),34/75 (45%),1.0,1.0 diff --git a/docs/user-guide/analyses/report/umod_km_curves.png b/docs/user-guide/analyses/report/umod_km_curves.png index c622f192e..b83596851 100644 Binary files a/docs/user-guide/analyses/report/umod_km_curves.png and b/docs/user-guide/analyses/report/umod_km_curves.png differ diff --git a/docs/user-guide/exploratory.rst b/docs/user-guide/exploratory.rst index a1f43ba84..ad9af70db 100644 --- a/docs/user-guide/exploratory.rst +++ b/docs/user-guide/exploratory.rst @@ -78,11 +78,11 @@ Then we choose the transcript and protein identifiers, and we fetch the correspo >>> from gpsea.preprocessing import configure_default_tx_coordinate_service, configure_default_protein_metadata_service >>> tx_id = "NM_181486.4" ->>> pt_id = "NP_852259.1" +>>> px_id = "NP_852259.1" >>> tx_service = configure_default_tx_coordinate_service(genome_build="GRCh38.p13") >>> tx_coordinates = tx_service.fetch(tx_id) >>> pm_service = configure_default_protein_metadata_service() ->>> protein_meta = pm_service.annotate(pt_id) +>>> protein_meta = pm_service.annotate(px_id) Last, we load HPO `v2024-07-01` to use in the exploratory analysis: @@ -164,20 +164,59 @@ with variants in the *TBX5* gene: Plot distribution of variants with respect to the protein sequence ------------------------------------------------------------------ -We use Matplotlib to plot the distribution of variants on a protein diagram: +Cohort artist +^^^^^^^^^^^^^ + +The simplest way to plot the variant distribution is to use the :class:`~gpsea.view.CohortArtist` API: >>> import matplotlib.pyplot as plt +>>> from gpsea.view import configure_default_cohort_artist +>>> cohort_artist = configure_default_cohort_artist() +>>> fig, ax = plt.subplots(figsize=(15, 8)) +>>> cohort_artist.draw_protein( +... cohort=cohort, +... protein_id=px_id, +... ax=ax, +... ) + + +.. image:: img/TBX5_protein_diagram.from_artist.png + :alt: TBX5 protein diagram + :align: center + :width: 600px + +.. doctest:: exploratory + :hide: + + >>> if _overwrite: + ... fig.tight_layout() + ... fig.savefig('docs/user-guide/img/TBX5_protein_diagram.from_artist.png') + +The :func:`~gpsea.view.configure_default_cohort_artist` function gets the default artist +which we use to plot the diagram with the variant distribution across the protein sequence +on Matplotlib axes. + + +Protein visualizer +^^^^^^^^^^^^^^^^^^ + +Sometimes, however, things do not work out-of-the-box, e.g. because protein metadata +is not available from Uniprot (the default), and we may need to use the lower-level components. + +The :class:`~gpsea.view.ProteinVisualizer` takes cohort and the protein metadata +to plot the distribution of variants on a protein diagram: + >>> from gpsea.view import ProteinVisualizer >>> fig, ax = plt.subplots(figsize=(15, 8)) >>> visualizer = ProteinVisualizer() ->>> visualizer.draw_protein_diagram( -... tx_coordinates, -... protein_meta, -... cohort, +>>> visualizer.draw_protein( +... cohort=cohort, +... protein_metadata=protein_meta, ... ax=ax, ... ) -.. image:: img/TBX5_protein_diagram.png + +.. image:: img/TBX5_protein_diagram.from_protein_visualizer.png :alt: TBX5 protein diagram :align: center :width: 600px @@ -187,5 +226,4 @@ We use Matplotlib to plot the distribution of variants on a protein diagram: >>> if _overwrite: ... fig.tight_layout() - ... fig.savefig('docs/user-guide/img/TBX5_protein_diagram.png') - + ... fig.savefig('docs/user-guide/img/TBX5_protein_diagram.from_protein_visualizer.png') diff --git a/docs/user-guide/img/TBX5_protein_diagram.from_artist.png b/docs/user-guide/img/TBX5_protein_diagram.from_artist.png new file mode 100644 index 000000000..12c969ca4 Binary files /dev/null and b/docs/user-guide/img/TBX5_protein_diagram.from_artist.png differ diff --git a/docs/user-guide/img/TBX5_protein_diagram.from_protein_visualizer.png b/docs/user-guide/img/TBX5_protein_diagram.from_protein_visualizer.png new file mode 100644 index 000000000..12c969ca4 Binary files /dev/null and b/docs/user-guide/img/TBX5_protein_diagram.from_protein_visualizer.png differ diff --git a/docs/user-guide/img/TBX5_protein_diagram.png b/docs/user-guide/img/TBX5_protein_diagram.png deleted file mode 100644 index 10422c9ec..000000000 Binary files a/docs/user-guide/img/TBX5_protein_diagram.png and /dev/null differ diff --git a/src/gpsea/analysis/clf/_gt_classifiers.py b/src/gpsea/analysis/clf/_gt_classifiers.py index c89bc4895..991aa8151 100644 --- a/src/gpsea/analysis/clf/_gt_classifiers.py +++ b/src/gpsea/analysis/clf/_gt_classifiers.py @@ -249,7 +249,7 @@ def monoallelic_classifier( a_predicate: VariantPredicate, b_predicate: typing.Optional[VariantPredicate] = None, a_label: str = "A", - b_label: str = "B", + b_label: typing.Optional[str] = None, ) -> GenotypeClassifier: """ Monoallelic classifier bins patient into one of two groups, `A` and `B`, @@ -260,17 +260,20 @@ def monoallelic_classifier( :param a_predicate: predicate to test if the variants meet the criteria of the first group (named `A` by default). - :param b_predicate: predicate to test if the variants - meet the criteria of the second group or `None` - if the inverse of `a_predicate` should be used (named `B` by default). + :param b_predicate: predicate to test if the variants meet + the criteria of the second group or `None` if the complement + of the `a_predicate` should be used (named ``A^C`` by default). :param a_label: display name of the `a_predicate` (default ``"A"``). - :param b_label: display name of the `b_predicate` (default ``"B"``). + :param b_label: display name of the `b_predicate`. + If `b_label` is not provided, then set to ``"{a_label}^C"`` (e.g. ``A^C`` if ``a_label=A``). """ assert isinstance(a_label, str) - assert isinstance(b_label, str) - - if b_predicate is None: - b_predicate = ~a_predicate + a_predicate, b_predicate, b_label = _validate_b_predicate( + a_predicate=a_predicate, + b_predicate=b_predicate, + a_label=a_label, + b_label=b_label, + ) return PolyCountingGenotypeClassifier.monoallelic( a_predicate=a_predicate, @@ -284,7 +287,7 @@ def biallelic_classifier( a_predicate: VariantPredicate, b_predicate: typing.Optional[VariantPredicate] = None, a_label: str = "A", - b_label: str = "B", + b_label: typing.Optional[str] = None, partitions: typing.Collection[typing.Union[int, typing.Collection[int]]] = ( 0, 1, @@ -302,22 +305,25 @@ def biallelic_classifier( :param a_predicate: predicate to test if the variants meet the criteria of the first group (named `A` by default). :param b_predicate: predicate to test if the variants meet - the criteria of the second group or `None` if an inverse - of `a_predicate` should be used (named `B` by default). + the criteria of the second group or `None` if the complement + of the `a_predicate` should be used (named ``A^C`` by default). :param a_label: display name of the `a_predicate` (default ``"A"``). - :param b_label: display name of the `b_predicate` (default ``"B"``). + :param b_label: display name of the `b_predicate`. + If `b_label` is not provided, then set to ``"{a_label}^C"`` (e.g. ``A^C`` if ``a_label=A``). :param partitions: a sequence with partition identifiers (default ``(0, 1, 2)``). """ # Q/C assert isinstance(a_label, str) - assert isinstance(b_label, str) - + a_predicate, b_predicate, b_label = _validate_b_predicate( + a_predicate=a_predicate, + b_predicate=b_predicate, + a_label=a_label, + b_label=b_label, + ) + partitions = _fixate_partitions(partitions) _qc_partitions(partitions) - if b_predicate is None: - b_predicate = ~a_predicate - return PolyCountingGenotypeClassifier.biallelic( a_predicate=a_predicate, b_predicate=b_predicate, @@ -326,6 +332,28 @@ def biallelic_classifier( partitions=partitions, ) +def _validate_b_predicate( + a_predicate: VariantPredicate, + b_predicate: typing.Optional[VariantPredicate], + a_label: str, + b_label: typing.Optional[str], +) -> typing.Tuple[ + VariantPredicate, VariantPredicate, str, +]: + if b_predicate is None: + b_predicate = ~a_predicate + if b_label is None: + # Using a regular uppercase `C` instead of Unicode complement (`∁`) + # to reduce the 😕 factor. + b_label = f"{a_label}^C" # complement of A + else: + assert isinstance(b_label, str) + else: + if b_label is None: + b_label = f"{a_label}^C" # complement of A + + return a_predicate, b_predicate, b_label + def _build_ac_to_cat( partitions: typing.Collection[typing.Collection[int]], diff --git a/src/gpsea/view/__init__.py b/src/gpsea/view/__init__.py index 6e353d5f6..2b079ce35 100644 --- a/src/gpsea/view/__init__.py +++ b/src/gpsea/view/__init__.py @@ -1,6 +1,9 @@ +from ._base import GpseaReport, BaseViewer, BaseProteinVisualizer, CohortArtist +from ._config import configure_default_protein_visualizer, configure_default_cohort_artist +from ._formatter import Formatter, VariantFormatter from ._phenotype_analysis import summarize_hpo_analysis from ._protein_visualizable import ProteinVisualizable -from ._base import GpseaReport, BaseViewer +from ._protein_visualizer import ProteinVisualizer from ._txp import VariantTranscriptVisualizer from ._viewers import ( CohortVariantViewer, @@ -9,13 +12,15 @@ MtcStatsViewer, ProteinVariantViewer, ) -from ._protein_visualizer import ProteinVisualizer -from ._formatter import Formatter, VariantFormatter __all__ = [ "GpseaReport", "CohortVariantViewer", "CohortViewer", + "BaseProteinVisualizer", + "configure_default_protein_visualizer", + "CohortArtist", + "configure_default_cohort_artist", "ProteinVisualizer", "ProteinVisualizable", "ProteinVariantViewer", diff --git a/src/gpsea/view/_base.py b/src/gpsea/view/_base.py index c13fce1e4..0fb271ca0 100644 --- a/src/gpsea/view/_base.py +++ b/src/gpsea/view/_base.py @@ -2,6 +2,8 @@ import io import typing +from gpsea.model import Cohort, ProteinMetadata +from gpsea.preprocessing import ProteinMetadataService from gpsea.util import open_text_io_handle_for_writing @@ -32,7 +34,6 @@ def was_were(count: int) -> str: class BaseViewer(metaclass=abc.ABCMeta): - def __init__(self): self._environment = Environment(loader=PackageLoader("gpsea.view", "templates")) self._environment.filters["pluralize"] = pluralize @@ -53,21 +54,22 @@ def write(self, fh: typing.Union[io.IOBase, str]): or :class:`io.IOBase` with the file-like object for writing the report into. """ pass - + class HtmlGpseaReport(GpseaReport): """ A report where the content is formatted as a HTML `str`. """ + # NOT PART OF THE PUBLIC API - + def __init__( self, html: str, ): assert isinstance(html, str) self._html = html - + @property def html(self) -> str: """ @@ -87,3 +89,81 @@ def write(self, fh: typing.Union[io.IOBase, str]): def _repr_html_(self) -> str: return self._html + + +class BaseProteinVisualizer(metaclass=abc.ABCMeta): + """ + The visualizer for plotting cohort variants on a protein diagram. + """ + + @abc.abstractmethod + def draw_protein( + self, + cohort: Cohort, + protein_metadata: ProteinMetadata, + ax, + **kwargs, + ): + """ + Draw the locations of the cohort variants on the provided axes. + + Keyword arguments + ----------------- + + The function can take multiple keyword arguments (``kwargs``): + + * `labeling_method`: the strategy for generating labels. + Valid values of labeling_method are ``{'abbreviate', 'enumerate'}`` + + :param cohort: the cohort to plot. + :param protein_metadata: information for the protein of interest. + :param ax: a Matplotlib :class:`~matplotlib.pyplot.Axes` to plot on + """ + pass + + +class CohortArtist: + """ + `CohortArtist` draws all types of figures to visualize + the salient aspects of the :class:`~gpsea.model.Cohort` of interest. + """ + + def __init__( + self, + protein_visualizer: BaseProteinVisualizer, + protein_metadata_service: ProteinMetadataService, + ): + assert isinstance(protein_visualizer, BaseProteinVisualizer) + self._protein_visualizer = protein_visualizer + assert isinstance(protein_metadata_service, ProteinMetadataService) + self._protein_metadata_service = protein_metadata_service + + def draw_protein( + self, + cohort: Cohort, + protein_id: str, + ax, + **kwargs, + ): + """ + Draw the locations of the cohort variants on the provided axes. + + See the :meth:`~gpsea.view.BaseProteinVisualizer.draw_protein` for more info about the arguments. + """ + protein_meta = self._protein_metadata_service.annotate(protein_id) + self._protein_visualizer.draw_protein( + cohort=cohort, + protein_metadata=protein_meta, + ax=ax, + **kwargs, + ) + + # TODO: implement once we know how to draw transcripts! + # def draw_transcript( + # self, + # cohort: Cohort, + # tx_id: str, + # ax, + # **kwargs, + # ): + # pass diff --git a/src/gpsea/view/_config.py b/src/gpsea/view/_config.py new file mode 100644 index 000000000..40f485331 --- /dev/null +++ b/src/gpsea/view/_config.py @@ -0,0 +1,41 @@ +import typing + +from gpsea.preprocessing import configure_default_protein_metadata_service + +from ._base import BaseProteinVisualizer, CohortArtist +from ._protein_visualizer import ProteinVisualizer + +def configure_default_protein_visualizer( + random_seed: int = 42, +) -> BaseProteinVisualizer: + return ProteinVisualizer( + random_seed=random_seed, + ) + +def configure_default_cohort_artist( + protein_source: typing.Literal["UNIPROT"] = "UNIPROT", + cache_dir: typing.Optional[str] = None, + timeout: typing.Union[float, int] = 30.0, + random_seed: int = 42, +) -> CohortArtist: + """ + Get the default :class:`~gpsea.view.CohortArtist`. + + :param protein_source: a `str` with the code of the protein data sources (currently accepting just `UNIPROT`). + :param cache_dir: path to the folder where we will cache the results fetched from the remote APIs or `None` + if the data should be cached as described by :func:`~gpsea.config.get_cache_dir_path` function. + In any case, the directory will be created if it does not exist (including any non-existing parents). + :param timeout: a `float` or an `int` for the timeout in seconds for the REST APIs. + """ + protein_visualizer = configure_default_protein_visualizer( + random_seed=random_seed, + ) + protein_metadata_service = configure_default_protein_metadata_service( + protein_source=protein_source, + cache_dir=cache_dir, + timeout=timeout, + ) + return CohortArtist( + protein_visualizer=protein_visualizer, + protein_metadata_service=protein_metadata_service, + ) diff --git a/src/gpsea/view/_phenotype_analysis.py b/src/gpsea/view/_phenotype_analysis.py index 8be597356..fa9c05374 100644 --- a/src/gpsea/view/_phenotype_analysis.py +++ b/src/gpsea/view/_phenotype_analysis.py @@ -20,10 +20,11 @@ def summarize_hpo_analysis( # Row index: a list of tested HPO terms pheno_idx = pd.Index(result.phenotypes) - # Column index: multiindex of counts and percentages for all genotype predicate groups - gt_idx = pd.MultiIndex.from_product( - iterables=(result.gt_clf.get_categories(), ("Count", "Percent")), - names=(result.gt_clf.variable_name, None), + # Column index: a summary of the counts and percentages for all genotype predicate groups + cat_names = list(cat.name for cat in result.gt_clf.get_categories()) + gt_idx = pd.Index( + data=cat_names, + name=result.gt_clf.variable_name, ) # We'll fill this frame with data @@ -36,16 +37,15 @@ def summarize_hpo_analysis( for gt_cat in count.columns: cnt = count.loc[ph_clf.present_phenotype_category, gt_cat] total = gt_totals[gt_cat] - df.loc[ph_clf.phenotype, (gt_cat, "Count")] = f"{cnt}/{total}" pct = 0 if total == 0 else round(cnt * 100 / total) - df.loc[ph_clf.phenotype, (gt_cat, "Percent")] = f"{pct}%" + df.loc[ph_clf.phenotype, gt_cat.name] = f"{cnt}/{total} ({pct}%)" # Add columns with p values and corrected p values (if present) p_val_col_name = "p values" corrected_p_val_col_name = "Corrected p values" if result.corrected_pvals is not None: - df.insert(df.shape[1], ("", corrected_p_val_col_name), result.corrected_pvals) - df.insert(df.shape[1], ("", p_val_col_name), result.pvals) + df.insert(df.shape[1], corrected_p_val_col_name, result.corrected_pvals) + df.insert(df.shape[1], p_val_col_name, result.pvals) # Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms # or just use the term ID CURIE otherwise (e.g. `OMIM:123000`). @@ -54,13 +54,18 @@ def summarize_hpo_analysis( # Last, sort by corrected p value or just p value df = df.set_index(labeled_idx) # and only report the tested HPO terms - with_p_value = df[("", p_val_col_name)].notna() + with_p_value = df[p_val_col_name].notna() + + sort_columns = [] if result.corrected_pvals is not None: - return df.sort_values( - by=[("", corrected_p_val_col_name), ("", p_val_col_name)] - ).loc[with_p_value] + sort_columns.append(corrected_p_val_col_name) + sort_columns.append(p_val_col_name) + sort_columns.extend(cat_names) else: - return df.sort_values(by=("", p_val_col_name)).loc[with_p_value] + sort_columns.append(p_val_col_name) + sort_columns.extend(cat_names) + + return df.sort_values(by=sort_columns).loc[with_p_value] def format_term_id( diff --git a/src/gpsea/view/_protein_visualizable.py b/src/gpsea/view/_protein_visualizable.py index babac8bed..a2ce400ee 100644 --- a/src/gpsea/view/_protein_visualizable.py +++ b/src/gpsea/view/_protein_visualizable.py @@ -1,4 +1,5 @@ import typing +import warnings from gpsea.model import ( Cohort, @@ -14,19 +15,27 @@ class ProteinVisualizable: + def __init__( self, - tx_coordinates: TranscriptCoordinates, + tx_coordinates: typing.Optional[TranscriptCoordinates], protein_meta: ProteinMetadata, cohort: Cohort, - ) -> None: - self._tx_coordinates = tx_coordinates + ): + # TODO[v1.0.0] - Remove from the public API! + warnings.warn( + "ProteinVisualizable was deprecated and will be remove in 1.0.0", + DeprecationWarning, + stacklevel=2, + ) + self._cohort = cohort self._protein_meta = protein_meta # store the annotations for the target transcript transcript_annotations = ProteinVisualizable._get_tx_anns( - cohort.all_variants(), self._tx_coordinates.identifier + cohort.all_variants(), protein_meta.protein_id, ) + variant_regions_on_protein: typing.List[Region] = list() self._variant_effect = list() for tx_ann in transcript_annotations: @@ -72,22 +81,22 @@ def __init__( @staticmethod def _get_tx_anns( variants: typing.Iterable[Variant], - tx_id: str, + protein_id: str, ) -> typing.Sequence[TranscriptAnnotation]: """ By default, the API returns transcript annotations for many transcripts. - We would like to store the annotations only for our transcript of interest (tx_id) + We would like to store the annotations only for our protein of interest (protein_id) """ tx_anns = [] for i, v in enumerate(variants): tx_ann = None for ann in v.tx_annotations: - if ann.transcript_id == tx_id: + if ann.protein_id is not None and ann.protein_id == protein_id: tx_ann = ann break if tx_ann is None: raise ValueError( - f"The transcript annotation for {tx_id} was not found!" + f"The transcript annotation for {protein_id} was not found!" ) else: tx_anns.append(tx_ann) @@ -95,12 +104,8 @@ def _get_tx_anns( return tx_anns @property - def transcript_coordinates(self) -> TranscriptCoordinates: - return self._tx_coordinates - - @property - def transcript_id(self) -> str: - return self._tx_coordinates.identifier + def cohort(self) -> Cohort: + return self._cohort @property def protein_id(self) -> str: diff --git a/src/gpsea/view/_protein_visualizer.py b/src/gpsea/view/_protein_visualizer.py index 36b5847a0..194c01810 100644 --- a/src/gpsea/view/_protein_visualizer.py +++ b/src/gpsea/view/_protein_visualizer.py @@ -1,6 +1,7 @@ -import random import heapq +import random import typing +import warnings from dataclasses import dataclass from itertools import cycle @@ -11,15 +12,19 @@ from gpsea.model import Cohort, ProteinMetadata, TranscriptCoordinates, VariantEffect +from ._base import BaseProteinVisualizer from ._protein_visualizable import ProteinVisualizable -class ProteinVisualizer: +class ProteinVisualizer(BaseProteinVisualizer): """ Draw a schema of a protein with variants of the cohort. """ - def __init__(self, random_seed: int = 42) -> None: + def __init__( + self, + random_seed: int = 42, + ): # colors mycolors = [m for m in mcolors.CSS4_COLORS.keys() if "grey" not in m and "white" not in m] rng = random.Random(random_seed) @@ -53,80 +58,66 @@ def __init__(self, random_seed: int = 42) -> None: self.legend2_max_x = 0.3 self.legend2_max_y = 0.75 - def draw_protein_diagram( + def draw_protein( self, - tx_coordinates: TranscriptCoordinates, - protein_metadata: ProteinMetadata, cohort: Cohort, - ax: typing.Optional[plt.Axes] = None, - labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' - ) -> typing.Optional[plt.Axes]: - pvis = ProteinVisualizable(tx_coordinates, protein_metadata, cohort) - return self.draw_fig(pvis, ax, labeling_method) - - def draw_fig( - self, - pvis: ProteinVisualizable, - ax: typing.Optional[plt.Axes] = None, - labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' - ) -> typing.Optional[plt.Axes]: - """ - Visualize the cohort variants on a protein diagram. - - By default, the legend is drawn to the right of the figure to avoid overlap between the variant markers - and the legend. - - Args: - pvis: :class:`ProteinVisualizable` with information about the transcript coordinates, protein metadata, - and the cohort for plotting - ax: a Matplotlib :class:`plt.Axes` to plot on or `None` if a new `Axes` should be created - labeling_method: the strategy for generating labels. - Valid values of labeling_method are `{'abbreviate', 'enumerate'}` - Returns: - `None` if an :class:`plt.Axes` was provided via `ax` argument - or an `Axes` created by the visualizer if `ax` was `None`. - """ - if ax is None: - should_return_ax = True - _, ax = plt.subplots(figsize=(20, 20)) - else: - should_return_ax = False - + protein_metadata: ProteinMetadata, + ax, + **kwargs, + ): + labeling_method = kwargs.pop("labeling_method", "abbreviate") + starts = [] + ends = [] + names = [] + for feature in protein_metadata.protein_features: + starts.append(feature.info.start) + ends.append(feature.info.end) + names.append(feature.info.name) + # STATE - feature_handler = DrawableProteinFeatureHandler(pvis, labeling_method, self._available_colors) + feature_handler = DrawableProteinFeatureHandler( + starts, + ends, + names, + labeling_method, + self._available_colors, + ) max_overlapping_features = sweep_line( [(f.min_pos_abs, f.max_pos_abs) for f in feature_handler.features] # define intervals ) - variant_handler = DrawableProteinVariantHandler(pvis) + variant_handler = DrawableProteinVariantHandler( + cohort=cohort, + protein_meta=protein_metadata, + ) - x_ticks = generate_ticks(apprx_n_ticks=6, min=1, max=pvis.protein_length) + x_ticks = generate_ticks(apprx_n_ticks=6, min=1, max=protein_metadata.protein_length) y_ticks = generate_ticks(apprx_n_ticks=5, min=0, max=variant_handler.max_marker_count) # normalize into [0, 1], leaving some space on the sides for f in feature_handler.features: cur_limits = f.min_pos_abs, f.max_pos_abs f.min_pos_plotting, f.max_pos_plotting = translate_to_ax_coordinates( - np.array(cur_limits), min_absolute=1, max_absolute=pvis.protein_length, + np.array(cur_limits), min_absolute=1, max_absolute=protein_metadata.protein_length, min_relative=self.protein_track_x_min, max_relative=self.protein_track_x_max ) for v in variant_handler.variants: pos_abs = v.pos_abs v.pos_plotting = translate_to_ax_coordinates( - np.array([pos_abs]), min_absolute=1, max_absolute=pvis.protein_length, + np.array([pos_abs]), min_absolute=1, max_absolute=protein_metadata.protein_length, min_relative=self.protein_track_x_min, max_relative=self.protein_track_x_max, clip=True) x_ticks_relative = translate_to_ax_coordinates(x_ticks, min_absolute=1, - max_absolute=pvis.protein_length, + max_absolute=protein_metadata.protein_length, min_relative=self.protein_track_x_min, max_relative=self.protein_track_x_max) # PLOTTING draw_axes(ax, x_ticks, x_ticks_relative, y_ticks, - variant_handler.max_marker_count, 1, pvis.protein_length, max_overlapping_features, + variant_handler.max_marker_count, 1, protein_metadata.protein_length, max_overlapping_features, self.protein_track_x_min, self.protein_track_x_max, self.protein_track_y_max, self.protein_track_height, self.protein_track_buffer, self.font_size, self.text_padding, @@ -140,7 +131,7 @@ def draw_fig( self.protein_feature_height, self.protein_feature_outline_color) - legend1_width = draw_legends(ax, feature_handler, pvis, + legend1_width = draw_legends(ax, feature_handler, self.color_box_x_dim, self.color_box_y_dim, self.color_circle_radius, self.row_spacing, self.legend1_min_x, self.legend1_max_y, @@ -152,12 +143,82 @@ def draw_fig( xlim=(0, max(1.0, self.legend1_min_x + legend1_width + 0.02)), ylim=(0.3, 0.75), aspect='equal', - title=f'{pvis.protein_metadata.label}\ntranscript: {pvis.transcript_id}, ' - f'protein: {pvis.protein_id}', + title=f'{protein_metadata.label}\n' + f'protein: {protein_metadata.protein_id}', ) ax.axis('off') + def draw_protein_diagram( + self, + tx_coordinates: TranscriptCoordinates, + protein_metadata: ProteinMetadata, + cohort: Cohort, + ax: typing.Optional[plt.Axes] = None, + labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' + ) -> typing.Optional[plt.Axes]: + warnings.warn( + "draw_protein_diagram was deprecated and will be removed in `1.0.0`. Use `draw_protein` instead", + DeprecationWarning, + stacklevel=2, + ) + if ax is None: + should_return_ax = True + _, ax = plt.subplots(figsize=(20, 20)) + else: + should_return_ax = False + + self.draw_protein( + cohort=cohort, + protein_metadata=protein_metadata, + ax=ax, + labeling_method=labeling_method, + ) + + if should_return_ax: + return ax + + + def draw_fig( + self, + pvis: ProteinVisualizable, + ax: typing.Optional[plt.Axes] = None, + labeling_method: typing.Literal['abbreviate', 'enumerate'] = 'abbreviate' + ) -> typing.Optional[plt.Axes]: + """ + Visualize the cohort variants on a protein diagram. + + By default, the legend is drawn to the right of the figure to avoid overlap between the variant markers + and the legend. + + Args: + pvis: :class:`ProteinVisualizable` with information about the transcript coordinates, protein metadata, + and the cohort for plotting + ax: a Matplotlib :class:`plt.Axes` to plot on or `None` if a new `Axes` should be created + labeling_method: the strategy for generating labels. + Valid values of labeling_method are `{'abbreviate', 'enumerate'}` + Returns: + `None` if an :class:`plt.Axes` was provided via `ax` argument + or an `Axes` created by the visualizer if `ax` was `None`. + """ + warnings.warn( + "draw_fig was deprecated and will be removed in `1.0.0`. Use `draw_protein` instead", + DeprecationWarning, + stacklevel=2, + ) + if ax is None: + should_return_ax = True + _, ax = plt.subplots(figsize=(20, 20)) + else: + should_return_ax = False + + self.draw_protein( + cohort=pvis.cohort, + protein_metadata=pvis.protein_metadata, + ax=ax, + labeling_method=labeling_method, + ) + if should_return_ax: return ax @@ -201,8 +262,17 @@ def draw(self, ax: plt.Axes, features_y_max: float, feature_height: float, featu class DrawableProteinFeatureHandler: - def __init__(self, pvis: ProteinVisualizable, labeling_method: str, colors: typing.Sequence[str]): - self.pvis = pvis + def __init__( + self, + starts: typing.Sequence[int], + ends: typing.Sequence[int], + feature_names: typing.Sequence[str], + labeling_method: str, + colors: typing.Sequence[str], + ): + self._starts = starts + self._ends = ends + self._feature_names = feature_names self.labeling_method = labeling_method self._available_colors = colors @@ -212,7 +282,7 @@ def __init__(self, pvis: ProteinVisualizable, labeling_method: str, colors: typi def _generate_labels(self): # aggregate similar feature names into one category - unique_feature_names = list(set(self.pvis.protein_feature_names)) + unique_feature_names = list(set(self._feature_names)) cleaned_unique_feature_names = set() mapping_all2cleaned = dict() for feature_name in unique_feature_names: @@ -237,20 +307,19 @@ def _generate_labels(self): return cleaned_unique_feature_names, mapping_all2cleaned, labels, colors - def _generate_features(self): - features = list() - for i in range(len(self.pvis.protein_feature_ends)): - features.append(DrawableProteinFeature( - min_pos_abs=self.pvis.protein_feature_starts[i], - max_pos_abs=self.pvis.protein_feature_ends[i], - name=self.pvis.protein_feature_names[i], - label=self.labels[self.mapping_all2cleaned[self.pvis.protein_feature_names[i]]], - color=self.colors[self.mapping_all2cleaned[self.pvis.protein_feature_names[i]]], + def _generate_features(self) -> typing.Sequence[DrawableProteinFeature]: + return [ + DrawableProteinFeature( + min_pos_abs=start, + max_pos_abs=end, + name=name, + label=self.labels[self.mapping_all2cleaned[name]], + color=self.colors[self.mapping_all2cleaned[name]], min_pos_plotting=-1.0, max_pos_plotting=-1.0, # will be set later in draw_fig(), when plot limits known track=0 - )) - - return features + ) + for start, end, name in zip(self._starts, self._ends, self._feature_names) + ] def draw_features(self, ax: plt.Axes, features_y_max: float, feature_height: float, feature_outline_color: str): for f in self.features: @@ -287,8 +356,17 @@ def name(self): class DrawableProteinVariantHandler: - def __init__(self, pvis: ProteinVisualizable, aggregation_method: typing.Literal['standard', 'disease'] = 'standard'): - self.pvis = pvis + def __init__( + self, + cohort: Cohort, + protein_meta: ProteinMetadata, + aggregation_method: typing.Literal['standard', 'disease'] = 'standard', + ): + self._pvis = ProteinVisualizable( + tx_coordinates=None, + protein_meta=protein_meta, + cohort=cohort, + ) if aggregation_method in ['standard', 'disease']: self.aggregation_method = aggregation_method else: @@ -336,7 +414,7 @@ def __init__(self, pvis: ProteinVisualizable, aggregation_method: typing.Literal VariantEffect.INTERGENIC_VARIANT: "#ff00ff", VariantEffect.SEQUENCE_VARIANT: "#33ff00", } - self.max_marker_count = np.max(self.pvis.marker_counts) + self.max_marker_count = np.max(self._pvis.marker_counts) if self.aggregation_method == 'standard': self.variants = self._generate_variant_markers() elif self.aggregation_method == 'disease': @@ -344,14 +422,14 @@ def __init__(self, pvis: ProteinVisualizable, aggregation_method: typing.Literal def _generate_variant_markers(self): variants = list() - for j, vl in enumerate(self.pvis.variant_locations_counted_absolute): - i = np.where(self.pvis.variant_locations == vl)[0][0] - effect = self.pvis.variant_effects[i] + for j, vl in enumerate(self._pvis.variant_locations_counted_absolute): + i = np.where(self._pvis.variant_locations == vl)[0][0] + effect = self._pvis.variant_effects[i] v = DrawableProteinVariant(effect=effect, pos_abs=vl, color=self.variant_effect2color[effect], pos_plotting=-1.0, - count=self.pvis.marker_counts[j]) + count=self._pvis.marker_counts[j]) variants.append(v) return variants @@ -551,28 +629,33 @@ def translate_to_ax_coordinates( return shifted_to_0_1 * relative_scale + min_relative -def assign_colors(names: typing.Iterable[str], available_colors: typing.Sequence[str]): +def assign_colors( + names: typing.Iterable[str], + available_colors: typing.Sequence[str], +) -> typing.Mapping[int, str]: num_colors = 0 - _colors = dict() + colors = dict() seen = set() for n in names: if n in seen: continue seen.add(n) color = available_colors[num_colors] - _colors[n] = color + colors[n] = color num_colors += 1 - return _colors + return colors -def draw_legends(ax: plt.Axes, feature_handler, pvis, - color_box_x_dim, color_box_y_dim, color_circle_radius, row_spacing, - legend1_min_x, legend1_max_y, - legend2_min_x, legend2_max_y, - variant_effect_colors, - labeling_method, - ): +def draw_legends( + ax: plt.Axes, + feature_handler, + color_box_x_dim, color_box_y_dim, color_circle_radius, row_spacing, + legend1_min_x, legend1_max_y, + legend2_min_x, legend2_max_y, + variant_effect_colors, + labeling_method, +): # draw legend 1 for protein features n_unique_features = len(feature_handler.cleaned_unique_feature_names) if labeling_method == 'abbreviate': @@ -627,7 +710,11 @@ def draw_legends(ax: plt.Axes, feature_handler, pvis, return legend1_width -def sweep_line(intervals: typing.Iterable[typing.Tuple[int, int]]) -> int: +def sweep_line( + intervals: typing.Iterable[ + typing.Tuple[typing.Union[int, float], typing.Union[int, float]], + ], +) -> int: """ Given a list of intervals, find the maximum number of overlapping intervals. diff --git a/tests/analysis/clf/test_gt_predicates.py b/tests/analysis/clf/test_gt_predicates.py index bd00577e2..800ff0a3c 100644 --- a/tests/analysis/clf/test_gt_predicates.py +++ b/tests/analysis/clf/test_gt_predicates.py @@ -133,7 +133,7 @@ class TestAllelePredicates: @pytest.mark.parametrize( "individual_name,expected_name", [ - ("adam", "B"), # 0/0 & 0/1 + ("adam", "A^C"), # 0/0 & 0/1 ("eve", "A"), # 0/1 & 0/0 ("cain", "A"), # 0/1 & 0/0 ], @@ -162,15 +162,15 @@ def test_monoallelic_predicate__general_stuff( gt_predicate = monoallelic_classifier(is_missense, is_synonymous) - assert gt_predicate.summarize_classes() == "Allele group: A, B" + assert gt_predicate.summarize_classes() == "Allele group: A, A^C" @pytest.mark.parametrize( "individual_name,expected_name", [ - ("walt", "A/B"), # 0/1 & 0/1 - ("skyler", "A/B"), # 0/1 & 0/1 + ("walt", "A/A^C"), # 0/1 & 0/1 + ("skyler", "A/A^C"), # 0/1 & 0/1 ("flynn", "A/A"), # 1/1 & 0/0 - ("holly", "B/B"), # 0/0 & 1/1 + ("holly", "A^C/A^C"), # 0/0 & 1/1 ], ) def test_biallelic_predicate( @@ -197,7 +197,7 @@ def test_biallelic_predicate__general_stuff( gt_predicate = biallelic_classifier(is_missense, is_synonymous) - assert gt_predicate.summarize_classes() == "Allele group: A/A, A/B, B/B" + assert gt_predicate.summarize_classes() == "Allele group: A/A, A/A^C, A^C/A^C" class TestSexPredicate: diff --git a/tests/conftest.py b/tests/conftest.py index ae9d9f216..a19bd6a3e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -150,6 +150,11 @@ def suox_mane_tx_id() -> str: return "NM_001032386.2" +@pytest.fixture(scope="session") +def suox_mane_protein_id() -> str: + return "NP_001027558.1" + + @pytest.fixture(scope="session") def suox_gt_clf( suox_mane_tx_id: str, @@ -211,9 +216,11 @@ def suox_mane_tx_coordinates( @pytest.fixture(scope="session") -def fpath_suox_protein_metadata(fpath_test_data_dir: str) -> str: - suox_mane_tx_protein_id = "NP_001027558.1" - return os.path.join(fpath_test_data_dir, f"SUOX-{suox_mane_tx_protein_id}.json") +def fpath_suox_protein_metadata( + fpath_test_data_dir: str, + suox_mane_protein_id: str, +) -> str: + return os.path.join(fpath_test_data_dir, f"SUOX-{suox_mane_protein_id}.json") @pytest.fixture(scope="session") diff --git a/tests/view/conftest.py b/tests/view/conftest.py new file mode 100644 index 000000000..c39740484 --- /dev/null +++ b/tests/view/conftest.py @@ -0,0 +1,58 @@ +import math + +import hpotk +import pytest +import pandas as pd + +from gpsea.analysis import StatisticResult +from gpsea.analysis.clf import HpoClassifier, GenotypeClassifier +from gpsea.analysis.mtc_filter import PhenotypeMtcResult +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.analysis.pcats.stats import FisherExactTest + + +@pytest.fixture(scope="package") +def hpo_term_analysis_result( + hpo: hpotk.MinimalOntology, + suox_gt_clf: GenotypeClassifier, +) -> HpoTermAnalysisResult: + is_arachnodactyly = HpoClassifier( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0001166"), # Arachnodactyly + ) + is_seizure = HpoClassifier( + hpo=hpo, + query=hpotk.TermId.from_curie("HP:0001250"), # Seizure + ) + return HpoTermAnalysisResult( + gt_clf=suox_gt_clf, + statistic=FisherExactTest(), + mtc_correction="fdr_bh", + pheno_clfs=( + is_arachnodactyly, + is_seizure, + ), + n_usable=(40, 20), + all_counts=( + pd.DataFrame( + data=[[10, 5], [10, 15]], + index=pd.Index(is_arachnodactyly.get_categories()), + columns=pd.Index(suox_gt_clf.get_categories()), + ), + pd.DataFrame( + data=[[5, 0], [5, 10]], + index=pd.Index(is_seizure.get_categories()), + columns=pd.Index(suox_gt_clf.get_categories()), + ), + ), + statistic_results=( + StatisticResult(statistic=None, pval=math.nan), + StatisticResult(statistic=1.23, pval=0.01), + ), + corrected_pvals=(math.nan, 0.01), + mtc_filter_name="Random MTC filter", + mtc_filter_results=( + PhenotypeMtcResult.fail("RMF01", "Not too interesting"), + PhenotypeMtcResult.ok(), + ), + ) diff --git a/tests/view/test_cohort_artist.py b/tests/view/test_cohort_artist.py new file mode 100644 index 000000000..be0e7ade2 --- /dev/null +++ b/tests/view/test_cohort_artist.py @@ -0,0 +1,28 @@ +import matplotlib.pyplot as plt + +import pytest + +from gpsea.model import Cohort +from gpsea.view import configure_default_cohort_artist, CohortArtist + + +class TestCohortArtist: + @pytest.fixture(scope="class") + def cohort_artist(self) -> CohortArtist: + return configure_default_cohort_artist() + + @pytest.mark.skip("Run manually on demand") + def test_draw_protein( + self, + suox_cohort: Cohort, + suox_mane_protein_id: str, + cohort_artist: CohortArtist, + ): + fig, ax = plt.subplots(figsize=(20, 20)) + cohort_artist.draw_protein( + cohort=suox_cohort, + protein_id=suox_mane_protein_id, + ax=ax, + ) + + fig.savefig("protein.png") diff --git a/tests/view/test_phenotype_analysis.py b/tests/view/test_phenotype_analysis.py new file mode 100644 index 000000000..aaf0104e6 --- /dev/null +++ b/tests/view/test_phenotype_analysis.py @@ -0,0 +1,18 @@ +import hpotk +import pytest + +from gpsea.analysis.pcats import HpoTermAnalysisResult +from gpsea.view import summarize_hpo_analysis + + +@pytest.mark.skip("Just for manual testing and debugging") +def test_summarize_hpo_analysis( + hpo: hpotk.MinimalOntology, + hpo_term_analysis_result: HpoTermAnalysisResult, +): + df = summarize_hpo_analysis( + hpo=hpo, + result=hpo_term_analysis_result, + ) + + print(df) diff --git a/tests/view/test_protein_visualizer.py b/tests/view/test_protein_visualizer.py index e983381e2..c454f1fad 100644 --- a/tests/view/test_protein_visualizer.py +++ b/tests/view/test_protein_visualizer.py @@ -2,43 +2,31 @@ import matplotlib.pyplot as plt import pytest -from gpsea.model import TranscriptCoordinates, ProteinMetadata, Cohort +from gpsea.model import Cohort, ProteinMetadata from gpsea.view import ( GpseaReport, - ProteinVisualizer, - ProteinVisualizable, + configure_default_protein_visualizer, + BaseProteinVisualizer, ProteinVariantViewer, ) class TestProteinVisualizer: - @pytest.fixture(scope="class") - def visualizer(self) -> ProteinVisualizer: - return ProteinVisualizer() - - @pytest.fixture(scope="class") - def visualizable( - self, - suox_mane_tx_coordinates: TranscriptCoordinates, - suox_protein_metadata: ProteinMetadata, - suox_cohort: Cohort, - ) -> ProteinVisualizable: - return ProteinVisualizable( - tx_coordinates=suox_mane_tx_coordinates, - protein_meta=suox_protein_metadata, - cohort=suox_cohort, - ) + def visualizer(self) -> BaseProteinVisualizer: + return configure_default_protein_visualizer() @pytest.mark.skip("Run manually on demand") def test_protein_visualizer( self, - visualizer: ProteinVisualizer, - visualizable: ProteinVisualizable, + visualizer: BaseProteinVisualizer, + suox_cohort: Cohort, + suox_protein_metadata: ProteinMetadata, ): fig, ax = plt.subplots(figsize=(20, 20)) - visualizer.draw_fig( - pvis=visualizable, + visualizer.draw_protein( + cohort=suox_cohort, + protein_metadata=suox_protein_metadata, ax=ax, ) diff --git a/tests/view/test_view.py b/tests/view/test_viewers.py similarity index 54% rename from tests/view/test_view.py rename to tests/view/test_viewers.py index 9dea42967..81d5414f5 100644 --- a/tests/view/test_view.py +++ b/tests/view/test_viewers.py @@ -72,52 +72,6 @@ def test_viewer( class TestMtcStatsViewer: - @pytest.fixture(scope="class") - def hpo_term_analysis_result( - self, - hpo: hpotk.MinimalOntology, - suox_gt_clf: GenotypeClassifier, - ) -> HpoTermAnalysisResult: - is_arachnodactyly = HpoClassifier( - hpo=hpo, - query=hpotk.TermId.from_curie("HP:0001166"), # Arachnodactyly - ) - is_seizure = HpoClassifier( - hpo=hpo, - query=hpotk.TermId.from_curie("HP:0001250"), # Seizure - ) - return HpoTermAnalysisResult( - gt_clf=suox_gt_clf, - statistic=FisherExactTest(), - mtc_correction="fdr_bh", - pheno_clfs=( - is_arachnodactyly, - is_seizure, - ), - n_usable=(40, 20), - all_counts=( - pd.DataFrame( - data=[[10, 5], [10, 15]], - index=pd.Index(is_arachnodactyly.get_categories()), - columns=pd.Index(suox_gt_clf.get_categories()), - ), - pd.DataFrame( - data=[[5, 0], [5, 10]], - index=pd.Index(is_seizure.get_categories()), - columns=pd.Index(suox_gt_clf.get_categories()), - ), - ), - statistic_results=( - StatisticResult(statistic=None, pval=math.nan), - StatisticResult(statistic=1.23, pval=0.01), - ), - corrected_pvals=(math.nan, 0.01), - mtc_filter_name="Random MTC filter", - mtc_filter_results=( - PhenotypeMtcResult.fail("RMF01", "Not too interesting"), - PhenotypeMtcResult.ok(), - ), - ) @pytest.fixture def stats_viewer(self) -> MtcStatsViewer: @@ -132,13 +86,3 @@ def test_process( report = stats_viewer.process(result=hpo_term_analysis_result) with open("mtc_stats.html", "w") as fh: report.write(fh) - - -@pytest.mark.skip("Just for manual testing and debugging") -def test_summarize( - hpo: hpotk.MinimalOntology, - hpo_result: HpoTermAnalysisResult, -): - df = summarize_hpo_analysis(hpo=hpo, result=hpo_result) - - print(df)