Skip to content

Commit

Permalink
Merge pull request #57 from monarch-initiative/56-add-random-terms-to…
Browse files Browse the repository at this point in the history
…-disease-profile

56-add-random-terms-to-disease-profile
  • Loading branch information
yaseminbridges authored Jan 22, 2025
2 parents ae0a6b2 + 55339ee commit b37d3b4
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "phenotype2phenopacket"
version = "0.6.12"
version = "0.6.13"
description = ""
authors = ["Yasemin Bridges <[email protected]>"]
readme = "README.md"
Expand Down
27 changes: 25 additions & 2 deletions src/phenotype2phenopacket/create/create.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from pathlib import Path
from typing import List

import polars as pl
from oaklib.implementations import ProntoImplementation
Expand All @@ -16,12 +17,29 @@
)


def _get_terms_for_randomisation(
    human_phenotype_ontology: ProntoImplementation,
) -> List[str]:
    """
    Get the pool of terms that random terms can be drawn from.

    The pool is every descendant of HP:0000118, minus HP:0000118 itself and
    minus the subjects of its incoming relationships (its direct children).

    Args:
        human_phenotype_ontology (ProntoImplementation): The loaded ontology to query.
    Returns:
        List[str]: The candidate term IDs, sorted so the order is the same on every run.
    """
    root_term = "HP:0000118"
    descendants = set(human_phenotype_ontology.descendants(root_term))
    # The descendants query may include the start term itself, so drop it explicitly.
    descendants.discard(root_term)
    # Each incoming relationship is indexed at position 1 to get the related (child) term.
    direct_children = {
        relationship[1]
        for relationship in human_phenotype_ontology.incoming_relationships(root_term)
    }
    # Set iteration order for strings differs between interpreter runs; sort for stability.
    return sorted(descendants - direct_children)


def create_synthetic_patient_phenopacket(
human_phenotype_ontology: ProntoImplementation,
omim_disease: pl.DataFrame,
output_dir: Path,
pt_id: str,
hpoa_version: str,
random_terms: List[str],
):
"""
Create a synthetic patient phenopacket from a set of phenotype entries for a specific OMIM disease.
Expand All @@ -34,7 +52,9 @@ def create_synthetic_patient_phenopacket(
hpoa_version (str): The version of the Human Phenotype Ontology Annotation.
"""
synthetic_patient_generator = SyntheticPatientGenerator(omim_disease, human_phenotype_ontology)
synthetic_patient_generator = SyntheticPatientGenerator(
omim_disease, human_phenotype_ontology, random_terms
)
patient_terms = synthetic_patient_generator.patient_term_annotation_set()
phenopacket_file = PhenotypeAnnotationToPhenopacketConverter(
human_phenotype_ontology
Expand All @@ -45,7 +65,8 @@ def create_synthetic_patient_phenopacket(
onset=synthetic_patient_generator.get_onset_range(),
)
write_phenopacket(
phenopacket_file.phenopacket, output_dir.joinpath(phenopacket_file.phenopacket_path)
phenopacket_file.phenopacket,
output_dir.joinpath(phenopacket_file.phenopacket_path),
)


Expand All @@ -72,6 +93,7 @@ def create_synthetic_patients(
"""
phenotype_annotation_data = return_phenotype_annotation_data(phenotype_annotation)
human_phenotype_ontology = load_ontology(local_cached_ontology)
random_terms = _get_terms_for_randomisation(human_phenotype_ontology)
grouped_omim_diseases = filter_diseases(
num_disease, omim_id, omim_id_list, phenotype_annotation_data
)
Expand All @@ -89,4 +111,5 @@ def create_synthetic_patients(
output_dir,
None,
phenotype_annotation_data.version,
random_terms,
)
53 changes: 48 additions & 5 deletions src/phenotype2phenopacket/utils/phenopacket_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import secrets
import threading
import warnings
from copy import copy
from copy import copy, deepcopy
from dataclasses import dataclass
from fractions import Fraction
from pathlib import Path
Expand Down Expand Up @@ -149,7 +149,12 @@ def write_phenopacket(phenopacket: Phenopacket, output_file: Path) -> None:
class SyntheticPatientGenerator:
"""Class for generating synthetic patients."""

def __init__(self, disease_df: pl.DataFrame, ontology: ProntoImplementation):
def __init__(
self,
disease_df: pl.DataFrame,
ontology: ProntoImplementation,
random_terms: List[str] = None,
):
"""
Initialise the SyntheticPatientGenerator class
Expand All @@ -164,6 +169,7 @@ def __init__(self, disease_df: pl.DataFrame, ontology: ProntoImplementation):
self.upper_age = 0
self.filtered_df = []
self.secret_rand = secrets.SystemRandom()
self.random_terms = random_terms

def get_number_of_terms(self) -> int:
"""
Expand Down Expand Up @@ -486,8 +492,8 @@ def get_parents_of_terms(self, phenotype_entry: dict, steps: int) -> dict:
term = "".join(rels[(list(rels.keys())[0])]) if rels else ""
if (
term.startswith("Abnormality of")
or term_id == "HP:0000118"
or term_id == "HP:0032443"
or parent == "HP:0000118"
or parent == "HP:0032443"
):
break
else:
Expand Down Expand Up @@ -545,6 +551,40 @@ def alter_term_specificity(
)
return new_phenotype_terms

def calculate_number_of_randomised_terms(self, num_terms: int) -> int:
    """
    Work out how many random terms to generate.

    Takes 0.33 of ``num_terms`` as a ceiling, scales it by a value drawn from
    ``self.secret_rand.uniform(0, 1)`` and rounds the result to an integer.

    Args:
        num_terms (int): The total number of terms.
    Returns:
        int: Number of terms to randomise.
    """
    ceiling = num_terms * 0.33
    scaling_factor = self.secret_rand.uniform(0, 1)
    return round(ceiling * scaling_factor)

def randomised_terms(self, num_terms: int, template_dict: dict) -> List[dict]:
    """
    Create randomised term entries from a template entry.

    Each returned entry is a deep copy of ``template_dict`` with ``hpo_id``
    replaced by a randomly chosen term and ``modifier``, ``sex`` and ``onset``
    set to None.

    Args:
        num_terms (int): The total number of terms; passed to
            ``calculate_number_of_randomised_terms`` to decide how many entries to create.
        template_dict (dict): The entry whose remaining fields are copied into each new entry.
    Returns:
        List[dict]: The randomised entries. Empty when ``self.random_terms`` is
            None or empty.
    """
    # The constructor defaults random_terms to None; choices() cannot sample
    # from None or from an empty pool, so there is nothing to add in that case.
    if self.random_terms is None or len(self.random_terms) == 0:
        return []
    num_terms_to_randomise = self.calculate_number_of_randomised_terms(num_terms)
    # NOTE: choices() samples with replacement, so the same term may be picked more than once.
    random_terms_subset = self.secret_rand.choices(self.random_terms, k=num_terms_to_randomise)
    return [
        {
            # Deep copy so nested values are not shared with the template entry.
            **deepcopy(template_dict),
            "hpo_id": random_term,
            "modifier": None,
            "sex": None,
            "onset": None,
        }
        for random_term in random_terms_subset
    ]

def patient_term_annotation_set(self) -> pl.DataFrame:
"""
Get the final patient term annotation set.
Expand All @@ -569,7 +609,10 @@ def patient_term_annotation_set(self) -> pl.DataFrame:
patient_terms_filtered = self.remove_terms_to_be_randomised(
patient_terms, patient_terms_sub_sample
)
final_patient_terms = patient_terms_filtered.to_dicts() + new_phenotype_terms
altered_patient_terms = patient_terms_filtered.to_dicts() + new_phenotype_terms
final_patient_terms = altered_patient_terms + self.randomised_terms(
len(altered_patient_terms), altered_patient_terms[0]
)
return pl.from_dicts(final_patient_terms, infer_schema_length=len(final_patient_terms))


Expand Down

0 comments on commit b37d3b4

Please sign in to comment.