Merge pull request #242 from monarch-initiative/tweak-pp-loading
Simplify phenopacket loading functions
ielis authored Sep 4, 2024
2 parents 61e92ad + b522972 commit 532e611
Showing 3 changed files with 74 additions and 23 deletions.
44 changes: 33 additions & 11 deletions docs/user-guide/input-data.rst
@@ -5,9 +5,8 @@ Input data
==========

The `gpsea` analysis needs to be provided with a standardized form of genotype and phenotype data.
The analyses require an instance of :class:`gpsea.model.Cohort` that consists
of :class:`gpsea.model.Patient`\ s - the cohort members. The cohort and the members
hold the standardized data and provide convenience functions for dataset exploration.
The analyses require an instance of :class:`~gpsea.model.Cohort` that consists
of individuals represented as :class:`~gpsea.model.Patient` objects.

.. seealso::

@@ -18,14 +17,21 @@ and performing functional annotation of the variants. Here we describe how to pr
for the exploratory and downstream analysis.


***************************************
Create a cohort from GA4GH phenopackets
---------------------------------------
***************************************

The easiest way to input data into `gpsea` is to use the
`GA4GH Phenopacket Schema <https://phenopacket-schema.readthedocs.io/en/latest>`_ phenopackets.
`gpsea` provides an out-of-the-box solution for loading a cohort from a folder of phenopacket JSON files.


Create cohort creator
=====================

Next, let's prepare a :class:`~gpsea.preprocessing.CohortCreator` that will turn a phenopacket collection
into a :class:`~gpsea.model.Cohort`. The cohort creator also performs input validation,
which requires Human Phenotype Ontology data.
Let's start with loading the Human Phenotype Ontology, a prerequisite for the input Q/C steps. We'll use the amazing
`hpo-toolkit <https://github.com/TheJacksonLaboratory/hpo-toolkit>`_ library which is installed along with
the standard `gpsea` installation:
@@ -34,9 +40,6 @@
>>> store = hpotk.configure_ontology_store()
>>> hpo = store.load_minimal_hpo(release='v2024-07-01')

Next, let's prepare a :class:`~gpsea.preprocessing.CohortCreator` that will turn a collection of phenopacket
into a :class:`~gpsea.model.Cohort`, required in the downstream steps.

The easiest way to get the `CohortCreator` is to use the
:func:`~gpsea.preprocessing.configure_caching_cohort_creator` convenience method:

@@ -53,7 +56,12 @@ The easiest way to get the `CohortCreator` is to use the
and the responses will be cached in the current working directory to reduce the network bandwidth.
See the :func:`~gpsea.preprocessing.configure_caching_cohort_creator` pydoc for more options.

We can create a cohort starting from a `Phenopacket` collection.

Load phenopackets
=================

We can create a cohort starting from a collection of `Phenopacket` objects
provided by the Python `Phenopackets <https://pypi.org/project/phenopackets>`_ library.
For the purpose of this example, we will load a cohort of patients with pathogenic mutations in the *RERE* gene
included in the release `0.1.18` of `Phenopacket Store <https://github.com/monarch-initiative/phenopacket-store>`_.
We use `Phenopacket Store Toolkit <https://github.com/monarch-initiative/phenopacket-store-toolkit>`_
@@ -86,8 +94,22 @@ Validated under none policy
No errors or warnings were found


Create a cohort from other data
-------------------------------
Alternative phenopacket sources
===============================

If you do not already have a `Phenopacket` collection at your fingertips,
`gpsea` provides a few other convenience functions for loading phenopackets from JSON files.

The :func:`~gpsea.preprocessing.load_phenopacket_files` function can be used to load
a collection of phenopacket JSON files:

>>> from gpsea.preprocessing import load_phenopacket_files
>>> pp_files = ('path/to/phenopacket1.json', 'path/to/phenopacket2.json')
>>> cohort, qc_results = load_phenopacket_files(pp_files, cohort_creator) # doctest: +SKIP

or you can load an entire directory of JSON files with :func:`~gpsea.preprocessing.load_phenopacket_folder`:

TODO - describe how to construct a Patient from raw HPO terms and variant coordinates.
>>> from gpsea.preprocessing import load_phenopacket_folder
>>> pp_dir = 'path/to/folder/with/many/phenopacket/json/files'
>>> cohort, qc_results = load_phenopacket_folder(pp_dir, cohort_creator) # doctest: +SKIP
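Under the hood, the folder loader simply discovers the `.json` files and hands them to the file-based loader, so the two calls above are interchangeable. A stdlib-only sketch of that equivalence (the `load_files`/`load_folder` names and the toy JSON "loading" are illustrative, not the gpsea API):

```python
import json
import os
import tempfile
from typing import Iterable, List

def load_files(paths: Iterable[str]) -> List[dict]:
    """Toy stand-in for load_phenopacket_files: parse each JSON file."""
    out = []
    for p in paths:
        with open(p) as fh:
            out.append(json.load(fh))
    return out

def load_folder(directory: str) -> List[dict]:
    """Toy stand-in for load_phenopacket_folder: discover files, then delegate."""
    paths = sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(".json")
    )
    return load_files(paths)

with tempfile.TemporaryDirectory() as tmp:
    for name in ("a.json", "b.json"):
        with open(os.path.join(tmp, name), "w") as fh:
            json.dump({"id": name}, fh)
    open(os.path.join(tmp, "README.txt"), "w").close()  # non-JSON, ignored

    by_folder = load_folder(tmp)
    by_files = load_files(sorted(
        os.path.join(tmp, n) for n in os.listdir(tmp) if n.endswith(".json")
    ))
    assert by_folder == by_files
    print([d["id"] for d in by_folder])  # -> ['a.json', 'b.json']
```

Either entry point therefore works; pick the folder variant when a directory is your unit of organization, and the file-list variant when you curate the paths yourself.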

5 changes: 3 additions & 2 deletions src/gpsea/preprocessing/__init__.py
@@ -2,7 +2,7 @@
from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator, ProteinMetadataService
from ._audit import Auditor, DataSanityIssue, Level, Notepad, NotepadTree
from ._config import configure_caching_patient_creator, configure_patient_creator
from ._config import load_phenopacket_folder, load_phenopackets
from ._config import load_phenopacket_folder, load_phenopacket_files, load_phenopackets
from ._config import configure_caching_cohort_creator, configure_cohort_creator
from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service
from ._generic import DefaultImpreciseSvFunctionalAnnotator
@@ -21,7 +21,8 @@
'configure_default_protein_metadata_service', 'configure_protein_metadata_service',
'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService',
'PatientCreator', 'CohortCreator',
'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopackets',
'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator',
'load_phenopacket_folder', 'load_phenopacket_files', 'load_phenopackets',
'TranscriptCoordinateService', 'GeneCoordinateService',
'PhenotypeCreator',
'ProteinAnnotationCache', 'ProtCachingMetadataService',
48 changes: 38 additions & 10 deletions src/gpsea/preprocessing/_config.py
@@ -383,20 +383,47 @@ def load_phenopacket_folder(
"""
Load phenopacket JSON files from a directory, validate the patient data, and assemble the patients into a cohort.
A file with `.json` suffix is considered to be a JSON file and all JSON files are assumed to be phenopackets.
Non-JSON files are ignored.
:param pp_directory: path to a folder with phenopacket JSON files. An error is raised if the path does not point to
a directory with at least one phenopacket.
:param cohort_creator: cohort creator for turning a sequence of phenopackets
into a :class:`~gpsea.model.Cohort`.
:param validation_policy: a `str` with the validation policy.
The value must be one of `{'none', 'lenient', 'strict'}`
:return: a tuple with the cohort and the preprocessing validation result.
:return: a tuple with the cohort and the validation result.
"""
# Load phenopackets
phenopackets = _load_phenopacket_dir(pp_directory)
pp_files = _find_phenopacket_files(pp_directory)

# Map to patients
return load_phenopacket_files(
pp_files=pp_files,
cohort_creator=cohort_creator,
validation_policy=validation_policy,
)


def load_phenopacket_files(
pp_files: typing.Iterator[str],
cohort_creator: CohortCreator[Phenopacket],
validation_policy: typing.Literal["none", "lenient", "strict"] = "none",
) -> typing.Tuple[Cohort, PreprocessingValidationResult]:
"""
Load phenopacket JSON files, validate the data, and assemble the phenopackets into a :class:`~gpsea.model.Cohort`.
The validation results are reported back along with the cohort.
:param pp_files: an iterator with paths to phenopacket JSON files.
:param cohort_creator: cohort creator for turning a phenopacket collection
into a :class:`~gpsea.model.Cohort`.
:param validation_policy: a `str` with the validation policy.
The value must be one of `{'none', 'lenient', 'strict'}`
:return: a tuple with the cohort and the validation result.
"""
return load_phenopackets(
phenopackets=phenopackets,
phenopackets=(_load_phenopacket(pp_file) for pp_file in pp_files),
cohort_creator=cohort_creator,
validation_policy=validation_policy,
)
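Note that `load_phenopacket_files` hands `load_phenopackets` a generator expression, so each file is only opened and parsed when cohort assembly actually consumes it. The same idea in miniature, with stdlib stand-ins rather than gpsea types:

```python
import io
import json

def parse_one(stream: io.StringIO) -> dict:
    # Stand-in for _load_phenopacket: parse a single JSON document.
    return json.load(stream)

streams = [io.StringIO('{"id": "pp-1"}'), io.StringIO('{"id": "pp-2"}')]

# Building the generator parses nothing yet; work happens on consumption.
lazy_docs = (parse_one(s) for s in streams)

# Each document is parsed one at a time as the consumer iterates.
ids = [doc["id"] for doc in lazy_docs]
print(ids)  # -> ['pp-1', 'pp-2']
```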
@@ -408,15 +435,17 @@ def load_phenopackets(
validation_policy: typing.Literal["none", "lenient", "strict"] = "none",
) -> typing.Tuple[Cohort, PreprocessingValidationResult]:
"""
Map phenopacket JSON file into patient, validate the patient data, and assemble the patients into a cohort.
Validate the phenopackets and assemble into a :class:`~gpsea.model.Cohort`.
The results of the validation are reported back.
:param phenopackets: an iterable of phenopackets to validate and assemble into the cohort.
:param cohort_creator: cohort creator for turning a sequence of phenopackets
into a :class:`~gpsea.model.Cohort`.
:param validation_policy: a `str` with the validation policy.
The value must be one of `{'none', 'lenient', 'strict'}`
:return: a tuple with the cohort and the preprocessing validation result.
:return: a tuple with the cohort and the validation result.
"""
# Check inputs before doing anything
hpotk.util.validate_instance(cohort_creator, CohortCreator, "cohort_creator")
Expand All @@ -438,17 +467,16 @@ def load_phenopackets(
return cohort, validation_result


def _load_phenopacket_dir(
def _find_phenopacket_files(
pp_dir: str,
) -> typing.Iterator[Phenopacket]:
) -> typing.Iterator[str]:
fpath_pp_abs = os.path.abspath(pp_dir)
if not os.path.isdir(fpath_pp_abs):
raise ValueError(f"`{fpath_pp_abs}` does not point to a directory")

for patient_file in os.listdir(pp_dir):
if patient_file.endswith(".json"):
phenopacket_path = os.path.join(pp_dir, patient_file)
yield _load_phenopacket(phenopacket_path)
yield os.path.join(pp_dir, patient_file)
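The renamed `_find_phenopacket_files` helper now yields paths rather than parsed phenopackets, keeping file discovery separate from parsing. A self-contained sketch of the same pattern (the `find_json_files` name is illustrative only):

```python
import os
import tempfile
from typing import Iterator

def find_json_files(directory: str) -> Iterator[str]:
    """Yield the path of every `.json` file in `directory`; ignore the rest."""
    abs_dir = os.path.abspath(directory)
    if not os.path.isdir(abs_dir):
        raise ValueError(f"`{abs_dir}` does not point to a directory")
    for name in os.listdir(abs_dir):
        if name.endswith(".json"):
            yield os.path.join(abs_dir, name)

with tempfile.TemporaryDirectory() as tmp:
    for name in ("x.json", "y.json", "notes.txt"):
        open(os.path.join(tmp, name), "w").close()
    found = sorted(os.path.basename(p) for p in find_json_files(tmp))
    print(found)  # -> ['x.json', 'y.json']
```

Separating discovery from parsing keeps the generator cheap: a malformed file only fails when its path is actually consumed by the parser downstream.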


def _load_phenopacket(phenopacket_path: str) -> Phenopacket:
