diff --git a/docs/user-guide/input-data.rst b/docs/user-guide/input-data.rst
index f04132955..b94379b36 100644
--- a/docs/user-guide/input-data.rst
+++ b/docs/user-guide/input-data.rst
@@ -5,9 +5,8 @@ Input data
 ==========
 
 The `gpsea` analysis needs to be provided with a standardized form of genotype and phenotype data.
-The analyses require an instance of :class:`gpsea.model.Cohort` that consists
-of :class:`gpsea.model.Patient`\ s - the cohort members. The cohort and the members
-hold the standardized data and provide convenience functions for dataset exploration.
+The analyses require an instance of :class:`~gpsea.model.Cohort` that consists
+of individuals represented as :class:`~gpsea.model.Patient` instances.
 
 .. seealso::
 
@@ -18,14 +17,21 @@ and performing functional annotation of the variants. Here we describe how to pr
 for the exploratory and downstream analysis.
 
 
+***************************************
 Create a cohort from GA4GH phenopackets
----------------------------------------
+***************************************
 
 The easiest way to input data into `gpsea` is to use the `GA4GH Phenopacket Schema `_ phenopackets.
 `gpsea` provides an out-of-the-box solution for loading a cohort from a folder of phenopacket JSON files.
 
+Create a cohort creator
+=======================
+
+Next, let's prepare a :class:`~gpsea.preprocessing.CohortCreator` that will turn a phenopacket collection
+into a :class:`~gpsea.model.Cohort`. The cohort creator also performs input validation.
+The validation needs Human Phenotype Ontology data.
 Let's start with loading Human Phenotype Ontology, a requisite for the input Q/C steps. We'll use the amazing
 `hpo-toolkit `_ library which is installed along with
 the standard `gpsea` installation:
@@ -34,9 +40,6 @@ the standard `gpsea` installation:
 >>> store = hpotk.configure_ontology_store()
 >>> hpo = store.load_minimal_hpo(release='v2024-07-01')
 
-Next, let's prepare a :class:`~gpsea.preprocessing.CohortCreator` that will turn a collection of phenopacket
-into a :class:`~gpsea.model.Cohort`, required in the downstream steps.
-
 The easiest way to get the `CohortCreator` is to use the
 :func:`~gpsea.preprocessing.configure_caching_cohort_creator` convenience method:
 
@@ -53,7 +56,12 @@ The easiest way to get the `CohortCreator` is to use the
 and the responses will be cached in the current working directory to reduce the network bandwidth.
 See the :func:`~gpsea.preprocessing.configure_caching_cohort_creator` pydoc for more options.
 
-We can create a cohort starting from a `Phenopacket` collection.
+
+Load phenopackets
+=================
+
+We can create a cohort starting from a collection of `Phenopacket` objects
+provided by the Python `Phenopackets `_ library.
 For the purpose of this example, we will load a cohort of patients with pathogenic mutations in *RERE* gene
 included in the release `0.1.18` of `Phenopacket Store `_.
 We use `Phenopacket Store Toolkit `_
@@ -86,8 +94,22 @@ Validated under none policy
 No errors or warnings were found
 
 
-Create a cohort from other data
--------------------------------
+Alternative phenopacket sources
+===============================
+
+In case you do not already have a `Phenopacket` collection at your fingertips,
+GPSEA provides a few other convenience functions for loading phenopackets from JSON files.
+
+The :func:`~gpsea.preprocessing.load_phenopacket_files` function can be used to load
+a collection of phenopacket JSON files:
+
+>>> from gpsea.preprocessing import load_phenopacket_files
+>>> pp_files = ('path/to/phenopacket1.json', 'path/to/phenopacket2.json')
+>>> cohort, qc_results = load_phenopacket_files(pp_files, cohort_creator) # doctest: +SKIP
+
+or you can load an entire directory of JSON files with :func:`~gpsea.preprocessing.load_phenopacket_folder`:
 
-TODO - describe how to construct a Patient from raw HPO terms and variant coordinates.
+>>> from gpsea.preprocessing import load_phenopacket_folder
+>>> pp_dir = 'path/to/folder/with/many/phenopacket/json/files'
+>>> cohort, qc_results = load_phenopacket_folder(pp_dir, cohort_creator) # doctest: +SKIP
diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py
index 1551b4d63..6c9872b03 100644
--- a/src/gpsea/preprocessing/__init__.py
+++ b/src/gpsea/preprocessing/__init__.py
@@ -2,7 +2,7 @@
 from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator, ProteinMetadataService
 from ._audit import Auditor, DataSanityIssue, Level, Notepad, NotepadTree
 from ._config import configure_caching_patient_creator, configure_patient_creator
-from ._config import load_phenopacket_folder, load_phenopackets
+from ._config import load_phenopacket_folder, load_phenopacket_files, load_phenopackets
 from ._config import configure_caching_cohort_creator, configure_cohort_creator
 from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service
 from ._generic import DefaultImpreciseSvFunctionalAnnotator
@@ -21,7 +21,8 @@
     'configure_default_protein_metadata_service', 'configure_protein_metadata_service',
     'VariantCoordinateFinder', 'FunctionalAnnotator', 'ImpreciseSvFunctionalAnnotator', 'ProteinMetadataService',
     'PatientCreator', 'CohortCreator',
-    'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator', 'load_phenopacket_folder', 'load_phenopackets',
+    'PhenopacketVariantCoordinateFinder', 'PhenopacketPatientCreator',
+    'load_phenopacket_folder', 'load_phenopacket_files', 'load_phenopackets',
     'TranscriptCoordinateService', 'GeneCoordinateService', 'PhenotypeCreator',
     'ProteinAnnotationCache', 'ProtCachingMetadataService',
diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py
index 348bd5b1c..32d8b255c 100644
--- a/src/gpsea/preprocessing/_config.py
+++ b/src/gpsea/preprocessing/_config.py
@@ -383,20 +383,47 @@ def load_phenopacket_folder(
     """
     Load phenopacket JSON files from a directory, validate the patient data, and assemble the patients
     into a cohort.
 
+    A file with the `.json` suffix is considered a JSON file, and all JSON files are assumed to be phenopackets.
+    Non-JSON files are ignored.
+
     :param pp_directory: path to a folder with phenopacket JSON files.
      An error is raised if the path does not point to a directory with at least one phenopacket.
     :param cohort_creator: cohort creator for turning a sequence of phenopacket
      into a :class:`~gpsea.model.Cohort`.
     :param validation_policy: a `str` with the validation policy.
      The value must be one of `{'none', 'lenient', 'strict'}`
-    :return: a tuple with the cohort and the preprocessing validation result.
+    :return: a tuple with the cohort and the validation result.
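+
+    A minimal usage sketch; the folder path below is a placeholder and `cohort_creator`
+    is assumed to have been prepared, e.g. with :func:`configure_caching_cohort_creator`:
+
+    >>> from gpsea.preprocessing import load_phenopacket_folder
+    >>> cohort, validation_result = load_phenopacket_folder(  # doctest: +SKIP
+    ...     pp_directory='path/to/folder/with/phenopacket/json/files',
+    ...     cohort_creator=cohort_creator,
+    ... )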
""" # Load phenopackets - phenopackets = _load_phenopacket_dir(pp_directory) + pp_files = _find_phenopacket_files(pp_directory) # Map to patients + return load_phenopacket_files( + pp_files=pp_files, + cohort_creator=cohort_creator, + validation_policy=validation_policy, + ) + + +def load_phenopacket_files( + pp_files: typing.Iterator[str], + cohort_creator: CohortCreator[Phenopacket], + validation_policy: typing.Literal["none", "lenient", "strict"] = "none", +) -> typing.Tuple[Cohort, PreprocessingValidationResult]: + """ + Load phenopacket JSON files, validate the data, and assemble into a :class:`~gpsea.model.Cohort`. + + Phenopackets are validated, assembled into a cohort, and the validation results are reported back. + + :param pp_files: an iterator with paths to phenopacket JSON files. + :param cohort_creator: cohort creator for turning a phenopacket collection + into a :class:`~gpsea.model.Cohort`. + :param validation_policy: a `str` with the validation policy. + The value must be one of `{'none', 'lenient', 'strict'}` + :return: a tuple with the cohort and the validation result. + """ return load_phenopackets( - phenopackets=phenopackets, + phenopackets=(_load_phenopacket(pp_file) for pp_file in pp_files), cohort_creator=cohort_creator, validation_policy=validation_policy, ) @@ -408,7 +435,9 @@ def load_phenopackets( validation_policy: typing.Literal["none", "lenient", "strict"] = "none", ) -> typing.Tuple[Cohort, PreprocessingValidationResult]: """ - Map phenopacket JSON file into patient, validate the patient data, and assemble the patients into a cohort. + Validate the phenopackets and assemble into a :class:`~gpsea.model.Cohort`. + + The results of the validation are reported back. :param phenopackets: path to a folder with phenopacket JSON files. An error is raised if the path does not point to a directory with at least one phenopacket. @@ -416,7 +445,7 @@ def load_phenopackets( into a :class:`~gpsea.model.Cohort`. :param validation_policy: a `str` with the validation policy. The value must be one of `{'none', 'lenient', 'strict'}` - :return: a tuple with the cohort and the preprocessing validation result. + :return: a tuple with the cohort and the validation result. """ # Check inputs before doing anything hpotk.util.validate_instance(cohort_creator, CohortCreator, "cohort_creator") @@ -438,17 +467,16 @@ def load_phenopackets( return cohort, validation_result -def _load_phenopacket_dir( +def _find_phenopacket_files( pp_dir: str, -) -> typing.Iterator[Phenopacket]: +) -> typing.Iterator[str]: fpath_pp_abs = os.path.abspath(pp_dir) if not os.path.isdir(fpath_pp_abs): raise ValueError(f"`{fpath_pp_abs}` does not point to a directory") - + for patient_file in os.listdir(pp_dir): if patient_file.endswith(".json"): - phenopacket_path = os.path.join(pp_dir, patient_file) - yield _load_phenopacket(phenopacket_path) + yield os.path.join(pp_dir, patient_file) def _load_phenopacket(phenopacket_path: str) -> Phenopacket: