Skip to content

Commit

Permalink
Merge pull request #254 from monarch-initiative/improve-summarize
Browse files Browse the repository at this point in the history
Improve summarize
  • Loading branch information
ielis authored Sep 9, 2024
2 parents a314f26 + 9e75d23 commit 51c439c
Show file tree
Hide file tree
Showing 12 changed files with 208 additions and 125 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Cache with transcript/protein pickle files
.gpsea_cache
dev/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
10 changes: 5 additions & 5 deletions docs/report/tbx5_frameshift_vs_missense.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ Muscular ventricular septal defect [HP:0011623],6/59,10%,6/25,24%,0.286867598598
Pulmonary arterial hypertension [HP:0002092],4/6,67%,0/2,0%,0.6623376623376622,0.42857142857142855
Hypoplasia of the ulna [HP:0003022],1/12,8%,2/10,20%,0.8095238095238093,0.5714285714285713
Hypoplasia of the radius [HP:0002984],30/62,48%,6/14,43%,1.0,0.7735491022101784
Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
Atrial septal defect [HP:0001631],42/44,95%,20/20,100%,1.0,1.0
Short thumb [HP:0009778],11/41,27%,8/30,27%,1.0,1.0
Absent radius [HP:0003974],7/32,22%,6/25,24%,1.0,1.0
Aplasia/Hypoplasia of the thumb [HP:0009601],20/20,100%,19/19,100%,,
Aplasia/Hypoplasia of fingers [HP:0006265],22/22,100%,19/19,100%,,
Aplasia/hypoplasia involving bones of the hand [HP:0005927],22/22,100%,19/19,100%,,
Short humerus [HP:0005792],7/17,41%,4/9,44%,1.0,1.0
Abnormal atrial septum morphology [HP:0011994],43/43,100%,20/20,100%,,
Abnormal cardiac septum morphology [HP:0001671],62/62,100%,28/28,100%,,
Abnormal heart morphology [HP:0001627],62/62,100%,30/30,100%,,
4 changes: 2 additions & 2 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,8 @@ by exploring the phenotype MTC filtering report.

and these are the top 20 HPO terms ordered by the p value corrected with the Benjamini-Hochberg procedure:

>>> from gpsea.analysis.predicate import PatientCategories
>>> summary_df = result.summarize(hpo, PatientCategories.YES)
>>> from gpsea.view import summarize_hpo_analysis
>>> summary_df = summarize_hpo_analysis(hpo, result)
>>> summary_df.head(20).to_csv('docs/report/tbx5_frameshift_vs_missense.csv') # doctest: +SKIP

.. csv-table:: *TBX5* frameshift vs missense
Expand Down
4 changes: 2 additions & 2 deletions docs/user-guide/stats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,8 @@ Last, let's explore the associations. The results include a table with all teste
ordered by the corrected p value (Benjamini-Hochberg FDR).
Here we show the top 20 table rows:

>>> from gpsea.analysis.predicate import PatientCategories
>>> summary_df = result.summarize(hpo, PatientCategories.YES)
>>> from gpsea.view import summarize_hpo_analysis
>>> summary_df = summarize_hpo_analysis(hpo, result)
>>> summary_df.head(20).to_csv('docs/user-guide/report/tbx5_frameshift.csv') # doctest: +SKIP

.. csv-table:: *TBX5* frameshift vs rest
Expand Down
134 changes: 22 additions & 112 deletions src/gpsea/analysis/pcats/_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from gpsea.model import Patient
from gpsea.analysis.pcats.stats import CountStatistic
from ..predicate import PatientCategory
from ..predicate.genotype import GenotypePolyPredicate
from ..predicate.phenotype import P, PhenotypePolyPredicate
from ..mtc_filter import PhenotypeMtcFilter, PhenotypeMtcResult
Expand Down Expand Up @@ -101,14 +100,6 @@ class MultiPhenotypeAnalysisResult(typing.Generic[P], metaclass=abc.ABCMeta):
with the order determined by the phenotype order.
"""

@property
@abc.abstractmethod
def phenotypes(self) -> typing.Sequence[P]:
"""
Get the tested phenotypes.
"""
pass

@property
@abc.abstractmethod
def n_usable(self) -> typing.Sequence[int]:
Expand Down Expand Up @@ -165,95 +156,6 @@ def total_tests(self) -> int:
"""
return sum(1 for pval in self.pvals if not math.isnan(pval))

def summarize(
self,
hpo: hpotk.MinimalOntology,
target_phenotype_category: PatientCategory,
) -> pd.DataFrame:
"""
Create a data frame with summary of the genotype phenotype analysis.
The *rows* of the frame correspond to the analyzed HPO terms.
The columns of the data frame have `Count` and `Percentage` per used genotype predicate.
**Example**
If we use :class:`~gpsea.analysis.predicate.genotype.VariantEffectPredicate`
which can compare phenotype with and without a missense variant, we will have a data frame
that looks like this::
MISSENSE_VARIANT on `NM_1234.5` No Yes
Count Percent Count Percent Corrected p value p value
Arachnodactyly [HP:0001166] 1/10 10% 13/16 81% 0.020299 0.000781
Abnormality of the musculature [HP:0003011] 6/6 100% 11/11 100% 1.000000 1.000000
Abnormal nervous system physiology [HP:0012638] 9/9 100% 15/15 100% 1.000000 1.000000
... ... ... ... ... ... ...
"""

# TODO: this should be plotted by a formatter.
# Row index: a list of tested HPO terms
pheno_idx = pd.Index(self.phenotypes)
# Column index: multiindex of counts and percentages for all genotype predicate groups
gt_idx = pd.MultiIndex.from_product(
# TODO: fix the below
iterables=(self._gt_predicate.get_categories(), ("Count", "Percent")),
names=(self._gt_predicate.get_question_base(), None),
)

# We'll fill this frame with data
df = pd.DataFrame(index=pheno_idx, columns=gt_idx)

for phenotype, count in zip(self.phenotypes, self.all_counts):
gt_totals = (
count.sum()
) # Sum across the phenotype categories (collapse the rows).
for gt_cat in count.columns:
cnt = count.loc[target_phenotype_category, gt_cat]
total = gt_totals[gt_cat]
df.loc[phenotype, (gt_cat, "Count")] = f"{cnt}/{total}"
pct = 0 if total == 0 else round(cnt * 100 / total)
df.loc[phenotype, (gt_cat, "Percent")] = f"{pct}%"

# Add columns with p values and corrected p values (if present)
p_val_col_name = "p values"
corrected_p_val_col_name = "Corrected p values"
if self.corrected_pvals is not None:
df.insert(
df.shape[1], ("", corrected_p_val_col_name), self.corrected_pvals
)
df.insert(df.shape[1], ("", p_val_col_name), self.pvals)

# Format the index values: `HP:0001250` -> `Seizure [HP:0001250]` if the index members are HPO terms
# or just use the term ID CURIE otherwise (e.g. `OMIM:123000`).
labeled_idx = df.index.map(
lambda term_id: MultiPhenotypeAnalysisResult._format_term_id(hpo, term_id)
)

# Last, sort by corrected p value or just p value
df = df.set_index(labeled_idx)
if self.corrected_pvals is not None:
return df.sort_values(
by=[("", corrected_p_val_col_name), ("", p_val_col_name)]
)
else:
return df.sort_values(by=("", p_val_col_name))

@staticmethod
def _format_term_id(
hpo: hpotk.MinimalOntology,
term_id: hpotk.TermId,
) -> str:
"""
Format a `term_id` as a `str`. HPO term ID is formatted as `<name> [<term_id>]` whereas other term IDs
are formatted as CURIEs (e.g. `OMIM:123000`).
"""
if term_id.prefix == "HP":
term_name = hpo.get_term_name(term_id)
return f"{term_name} [{term_id.value}]"
else:
return term_id.value


class MultiPhenotypeAnalysis(typing.Generic[P], metaclass=abc.ABCMeta):

Expand Down Expand Up @@ -377,7 +279,8 @@ def _compute_nominal_pvals(
def _apply_mtc(
self,
pvals: typing.Sequence[float],
) -> typing.Optional[typing.Sequence[float]]:
) -> typing.Sequence[float]:
assert self._mtc_correction is not None
_, corrected_pvals, _, _ = multitest.multipletests(
pvals=pvals,
alpha=self._mtc_alpha,
Expand All @@ -392,14 +295,14 @@ class BaseMultiPhenotypeAnalysisResult(typing.Generic[P], MultiPhenotypeAnalysis

def __init__(
self,
phenotypes: typing.Sequence[P],
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[P]],
n_usable: typing.Sequence[int],
all_counts: typing.Sequence[pd.DataFrame],
pvals: typing.Sequence[float],
corrected_pvals: typing.Optional[typing.Sequence[float]],
gt_predicate: GenotypePolyPredicate,
):
self._phenotypes = tuple(phenotypes)
self._pheno_predicates = tuple(pheno_predicates)
self._n_usable = tuple(n_usable)
self._all_counts = tuple(all_counts)
self._pvals = tuple(pvals)
Expand All @@ -409,9 +312,19 @@ def __init__(
assert isinstance(gt_predicate, GenotypePolyPredicate)
self._gt_predicate = gt_predicate

@property
def gt_predicate(self) -> GenotypePolyPredicate:
return self._gt_predicate

@property
def pheno_predicates(
self,
) -> typing.Sequence[PhenotypePolyPredicate[P]]:
return self._pheno_predicates

@property
def phenotypes(self) -> typing.Sequence[P]:
return self._phenotypes
return tuple(p.phenotype for p in self._pheno_predicates)

@property
def n_usable(self) -> typing.Sequence[int]:
Expand All @@ -431,7 +344,7 @@ def corrected_pvals(self) -> typing.Optional[typing.Sequence[float]]:

def __eq__(self, other):
return isinstance(other, BaseMultiPhenotypeAnalysisResult) \
and self._phenotypes == other._phenotypes \
and self._pheno_predicates == other._pheno_predicates \
and self._n_usable == other._n_usable \
and self._all_counts == other._all_counts \
and self._pvals == other._pvals \
Expand All @@ -441,7 +354,7 @@ def __eq__(self, other):
def __hash__(self):
# NOTE: if a field is added, the hash method of the subclasses must be updated as well.
return hash((
self._phenotypes, self._n_usable, self._all_counts,
self._pheno_predicates, self._n_usable, self._all_counts,
self._pvals, self._corrected_pvals, self._gt_predicate,
))

Expand Down Expand Up @@ -474,10 +387,8 @@ def _compute_result(
else:
corrected_pvals = self._apply_mtc(pvals=pvals)

phenotypes = tuple(p.phenotype for p in pheno_predicates)

return BaseMultiPhenotypeAnalysisResult(
phenotypes=phenotypes,
pheno_predicates=pheno_predicates,
n_usable=n_usable,
all_counts=all_counts,
pvals=pvals,
Expand All @@ -496,7 +407,7 @@ class HpoTermAnalysisResult(BaseMultiPhenotypeAnalysisResult[hpotk.TermId]):

def __init__(
self,
phenotypes: typing.Sequence[hpotk.TermId],
pheno_predicates: typing.Iterable[PhenotypePolyPredicate[hpotk.TermId]],
n_usable: typing.Sequence[int],
all_counts: typing.Sequence[pd.DataFrame],
pvals: typing.Sequence[float],
Expand All @@ -507,7 +418,7 @@ def __init__(
mtc_name: typing.Optional[str],
):
super().__init__(
phenotypes=phenotypes,
pheno_predicates=pheno_predicates,
n_usable=n_usable,
all_counts=all_counts,
pvals=pvals,
Expand Down Expand Up @@ -549,7 +460,7 @@ def __eq__(self, other):

def __hash__(self):
return hash((
self._phenotypes, self._n_usable, self._all_counts,
self._pheno_predicates, self._n_usable, self._all_counts,
self._pvals, self._corrected_pvals, self._gt_predicate,
self._mtc_filter_name, self._mtc_filter_results, self._mtc_name,
))
Expand Down Expand Up @@ -590,7 +501,6 @@ def _compute_result(
pheno_predicates = tuple(pheno_predicates)
if len(pheno_predicates) == 0:
raise ValueError("No phenotype predicates were provided")
phenotypes = tuple(p.phenotype for p in pheno_predicates)

# 1 - Count the patients
n_usable, all_counts = apply_predicates_on_patients(
Expand Down Expand Up @@ -625,7 +535,7 @@ def _compute_result(
corrected_pvals[mtc_mask] = self._apply_mtc(pvals=pvals[mtc_mask])

return HpoTermAnalysisResult(
phenotypes=phenotypes,
pheno_predicates=pheno_predicates,
n_usable=n_usable,
all_counts=all_counts,
pvals=pvals,
Expand Down
2 changes: 2 additions & 0 deletions src/gpsea/analysis/predicate/genotype/_gt_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from ._api import VariantPredicate
from ._counter import AlleleCounter

# TODO: implement __hash__, __eq__ on predicates


class AlleleCountingGenotypeBooleanPredicate(GenotypePolyPredicate):
# NOT PART OF THE PUBLIC API
Expand Down
16 changes: 16 additions & 0 deletions src/gpsea/analysis/predicate/phenotype/_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,15 @@ def test(

return None

def __eq__(self, value: object) -> bool:
return isinstance(value, PropagatingPhenotypePredicate) \
and self._hpo.version == value._hpo.version \
and self._query == value._query \
and self._missing_implies_phenotype_excluded == value._missing_implies_phenotype_excluded

def __hash__(self) -> int:
return hash((self._hpo.version, self._query, self._missing_implies_phenotype_excluded))

def __repr__(self):
return f"PropagatingPhenotypeBooleanPredicate(query={self._query})"

Expand Down Expand Up @@ -264,5 +273,12 @@ def test(

return self._diagnosis_excluded

def __eq__(self, value: object) -> bool:
return isinstance(value, DiseasePresencePredicate) \
and self._query == value._query

def __hash__(self) -> int:
return hash((self._query,))

def __repr__(self):
return f"DiseasePresencePredicate(query={self._query})"
2 changes: 2 additions & 0 deletions src/gpsea/view/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ._cohort import CohortViewable
from ._disease import DiseaseViewable
from ._phenotype_analysis import summarize_hpo_analysis
from ._protein_viewer import ProteinViewable
from ._protein_visualizable import ProteinVisualizable
from ._stats import MtcStatsViewer
Expand All @@ -12,6 +13,7 @@
'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable',
'DiseaseViewable',
'MtcStatsViewer',
'summarize_hpo_analysis',
'VariantTranscriptVisualizer',
'VariantFormatter',
]
Loading

0 comments on commit 51c439c

Please sign in to comment.