diff --git a/pyproject.toml b/pyproject.toml index fe12aa6a3..68fda037b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ classifiers = [ ] dependencies = [ "hpo-toolkit>=0.3.0", + "stairval>=0.2.0", "Jinja2>=3.1.4,<4.0.0", "protobuf>=3.15.0", "pandas>=2.0.0,<3.0.0", diff --git a/src/gpsea/model/_cohort.py b/src/gpsea/model/_cohort.py index fba80f0c0..e6eecf3b1 100644 --- a/src/gpsea/model/_cohort.py +++ b/src/gpsea/model/_cohort.py @@ -267,12 +267,16 @@ def __hash__(self) -> int: class Cohort(typing.Sized, typing.Iterable[Patient]): + """ + Cohort is a collection of individuals that have been preprocessed + and are ready for genotype-phenotype association analysis. + """ @staticmethod def from_patients( - members: typing.Iterable[Patient], - include_patients_with_no_HPO: bool = False, - include_patients_with_no_variants: bool = False, + members: typing.Iterable[Patient], + include_patients_with_no_HPO: bool = False, + include_patients_with_no_variants: bool = False, ): """ Create a cohort from a sequence of patients. diff --git a/src/gpsea/preprocessing/__init__.py b/src/gpsea/preprocessing/__init__.py index 77be3f5f1..3c85d3e68 100644 --- a/src/gpsea/preprocessing/__init__.py +++ b/src/gpsea/preprocessing/__init__.py @@ -1,7 +1,6 @@ from ._api import PreprocessingValidationResult from ._api import TranscriptCoordinateService, GeneCoordinateService from ._api import VariantCoordinateFinder, FunctionalAnnotator, ImpreciseSvFunctionalAnnotator, ProteinMetadataService -from ._audit import Auditor, DataSanityIssue, Level, Notepad, NotepadTree from ._config import load_phenopacket_folder, load_phenopacket_files, load_phenopackets from ._config import configure_caching_cohort_creator, configure_cohort_creator from ._config import configure_default_protein_metadata_service, configure_protein_metadata_service @@ -27,6 +26,5 @@ 'UniprotProteinMetadataService', 'VepFunctionalAnnotator', 'VariantAnnotationCache', 'VarCachingFunctionalAnnotator', 'VVHgvsVariantCoordinateFinder', 'VVMultiCoordinateService', - 'Auditor', 'DataSanityIssue', 'Level', 'Notepad', 'NotepadTree', 'DefaultImpreciseSvFunctionalAnnotator', ] diff --git a/src/gpsea/preprocessing/_api.py b/src/gpsea/preprocessing/_api.py index f274cb814..fa7a2a7c2 100644 --- a/src/gpsea/preprocessing/_api.py +++ b/src/gpsea/preprocessing/_api.py @@ -1,9 +1,10 @@ import abc -import io import os import sys import typing +from stairval.notepad import Notepad + from gpsea.model import ( VariantCoordinates, ProteinMetadata, @@ -13,8 +14,6 @@ ImpreciseSvInfo, ) -from ._audit import NotepadTree - T = typing.TypeVar("T") @@ -142,7 +141,7 @@ class PreprocessingValidationResult: def __init__( self, policy: str, - notepad: NotepadTree, + notepad: Notepad, ): self._policy = policy self._notepad = notepad @@ -175,7 +174,7 @@ def is_ok(self) -> bool: def summarize( self, - file: io.TextIOBase = sys.stdout, + file: typing.TextIO = sys.stdout, indent: int = 2, ): """ @@ -189,13 +188,13 @@ def summarize( file.write(f"Validated under {self._policy} policy") file.write(os.linesep) - n_errors = sum(node.error_count() for node in self._notepad.iterate_nodes()) - n_warnings = sum(node.warning_count() for node in self._notepad.iterate_nodes()) + n_errors = sum(node.error_count() for node in self._notepad.iter_sections()) + n_warnings = sum(node.warning_count() for node in self._notepad.iter_sections()) if n_errors > 0 or n_warnings > 0: file.write("Showing errors and warnings") file.write(os.linesep) - for node in self._notepad.iterate_nodes(): + for node in self._notepad.iter_sections(): if node.has_errors_or_warnings(include_subsections=True): # We must report the node label even if there are no issues with the node. l_pad = " " * (node.level * indent) @@ -207,9 +206,9 @@ def summarize( file.write(os.linesep) for error in node.errors(): file.write( - l_pad + " " + error.message + f". {error.solution}" - if error.solution - else "" + l_pad + " ·" + error.message + ( + f". {error.solution}" if error.solution else "" + ) ) file.write(os.linesep) if node.has_warnings(): @@ -217,9 +216,9 @@ def summarize( file.write(os.linesep) for warning in node.warnings(): file.write( - l_pad + " ·" + warning.message + f". {warning.solution}" - if warning.solution - else "" + l_pad + " ·" + warning.message + ( + f". {warning.solution}" if warning.solution else "" + ) ) file.write(os.linesep) else: diff --git a/src/gpsea/preprocessing/_audit.py b/src/gpsea/preprocessing/_audit.py deleted file mode 100644 index 3bdafe97e..000000000 --- a/src/gpsea/preprocessing/_audit.py +++ /dev/null @@ -1,372 +0,0 @@ -import abc -import enum -import typing - - -class Level(enum.Enum): - """ - An enum to represent severity of the :class:`DataSanityIssue`. - """ - - WARN = enum.auto() - """ - Warning is an issue when something not entirely right. However, unlike :class:`Level.ERROR`, the analysis should - complete albeit with sub-optimal results 😧. - """ - - ERROR = enum.auto() - """ - Error is a serious issue in the input data and the downstream analysis may not complete or the analysis results - may be malarkey 😱. - """ - - def __str__(self): - return self.name - - -class DataSanityIssue: - """ - `DataSanityIssue` summarizes an issue found in the input data. - - The issue has a `level`, a `message` with human-friendly description, and an optional `solution` - for removing the issue. - """ - - def __init__(self, level: Level, - message: str, - solution: typing.Optional[str] = None): - self._level = level - self._message = message - self._solution = solution - - @property - def level(self) -> Level: - return self._level - - @property - def message(self) -> str: - return self._message - - @property - def solution(self) -> typing.Optional[str]: - return self._solution - - def __str__(self): - return f'DataSanityIssue(level={self._level}, message={self._message}, solution={self._solution})' - - def __repr__(self): - return str(self) - - -IN = typing.TypeVar('IN') -""" -:class:`Auditor` input. -""" - -OUT = typing.TypeVar('OUT') -""" -:class:`Auditor` output format. -""" - - -# TODO: remove -class AuditReport(typing.Generic[OUT]): - """ - `AuditReport` includes the issues found by :class:`Auditor` and the outcome of the sanitation. - """ - - def __init__(self, outcome: OUT, - issues: typing.Iterable[DataSanityIssue]): - self._outcome = outcome - self._issues = tuple(issues) - - @property - def outcome(self) -> OUT: - return self._outcome - - @property - def issues(self) -> typing.Sequence[DataSanityIssue]: - """ - Returns: - a sequence of :class:`DataSanityIssue`. - """ - return self._issues - - def warnings(self) -> typing.Iterator[DataSanityIssue]: - """ - Returns: - an iterator over the warnings. - """ - return filter(lambda i: i.level == Level.WARN, self.issues) - - def errors(self) -> typing.Iterator[DataSanityIssue]: - """ - Returns: - an iterator over the errors. - """ - return filter(lambda i: i.level == Level.ERROR, self.issues) - - def has_errors(self) -> bool: - """ - Returns: - bool: `True` if one or more errors were found in the input. - """ - for _ in self.errors(): - return True - return False - - def has_warnings_or_errors(self) -> bool: - """ - Returns: - bool: `True` if one or more errors or warnings were found in the input. - """ - for _ in self.warnings(): - return True - for _ in self.errors(): - return True - return False - - def n_warnings(self) -> int: - """ - Returns: - int: the number of warnings in the audit report. - """ - return sum(1 for issue in self.issues if issue.level == Level.WARN) - - def n_errors(self) -> int: - """ - Returns: - int: the number of errors in the audit report. - """ - return sum(1 for issue in self.issues if issue.level == Level.ERROR) - - def __str__(self): - return f'AuditReport(issues={self._issues}, outcome={self._outcome})' - - def __repr__(self): - return str(self) - - -class Notepad(metaclass=abc.ABCMeta): - """ - Record issues encountered during parsing/validation of a hierarchical data structure. - - The issues can be organized in sections. `Notepad` keeps track of issues in one section - and the subsections can be created by calling :func:`add_subsection`. The function returns - an instance responsible for issues of a subsection. - - A collection of the issues from the current section are available via :attr:`issues` property - and the convenience functions provide iterators over error and warnings. - """ - - def __init__(self, label: str): - self._label = label - self._issues: typing.MutableSequence[DataSanityIssue] = [] - - - @abc.abstractmethod - def add_subsection(self, label: str) -> "Notepad": - """ - Add a labeled subsection. - - Returns: - Notepad: a notepad for recording issues within the subsection. - """ - pass - - @property - def label(self) -> str: - """ - Get a `str` with the section label. - """ - return self._label - - @property - def issues(self) -> typing.Sequence[DataSanityIssue]: - """ - Get an iterable with the issues of the current section. - """ - return self._issues - - def add_issue(self, level: Level, message: str, solution: typing.Optional[str] = None): - """ - Add an issue with certain `level`, `message`, and an optional `solution`. - """ - self._issues.append(DataSanityIssue(level, message, solution)) - - def add_error(self, message: str, solution: typing.Optional[str] = None): - """ - A convenience function for adding an *error* with a `message` and an optional `solution`. - """ - self.add_issue(Level.ERROR, message, solution) - - def add_warning(self, message: str, solution: typing.Optional[str] = None): - """ - A convenience function for adding a *warning* with a `message` and an optional `solution`. - """ - self.add_issue(Level.WARN, message, solution) - - def errors(self) -> typing.Iterator[DataSanityIssue]: - """ - Iterate over the errors of the current section. - """ - return filter(lambda dsi: dsi.level == Level.ERROR, self.issues) - - def warnings(self) -> typing.Iterator[DataSanityIssue]: - """ - Iterate over the warnings of the current section. - """ - return filter(lambda dsi: dsi.level == Level.WARN, self.issues) - - def error_count(self) -> int: - """ - Returns: - int: count of errors found in this section. - """ - return sum(1 for _ in self.errors()) - - def warning_count(self) -> int: - """ - Returns: - int: count of warnings found in this section. - """ - return sum(1 for _ in self.warnings()) - - -class NotepadTree(Notepad): - """ - `NotepadTree` implements :class:`Notepad` using a tree where each tree node corresponds to a (sub)section. The node - can have `0..n` children. - - Each node has a :attr:`label`, a collection of issues, and children with subsections. For convenience, the node - has :attr:`level` to correspond to the depth of the node within the tree (the level of the root node is `0`). - - The nodes can be accessed via :attr:`children` property or through convenience methods for tree traversal, either - using the visitor pattern (:func:`visit`) or by iterating over the nodes via :func:`iterate_nodes`. In both cases, - the traversal is done in the depth-first fashion. - """ - - def __init__(self, label: str, level: int): - super().__init__(label) - self._level = level - self._children = [] - - @property - def children(self): - return self._children - - @property - def level(self) -> int: - return self._level - - def add_subsection(self, identifier: str): - sub = NotepadTree(identifier, self._level + 1) - self._children.append(sub) - return sub - - def visit(self, visitor): - """ - Perform a depth-first search on the tree and call `visitor` with all nodes. - Args: - visitor: a callable that takes the current node as a single argument. - """ - stack = [self] - - while stack: - node = stack.pop() - # Reversed to visit in the add order. - stack.extend(reversed(node.children)) - visitor(node) - - def iterate_nodes(self): - """ - Iterate over nodes in the depth-first fashion. - - Returns: a depth-first node iterator. - """ - stack = [self] - while stack: - node = stack.pop() - stack.extend(reversed(node.children)) - yield node - - def has_warnings(self, include_subsections: bool = False) -> bool: - """ - Returns: - bool: `True` if one or more warnings were found in the current section or its subsections. - """ - if include_subsections: - for node in self.iterate_nodes(): - for _ in node.warnings(): - return True - else: - for _ in self.warnings(): - return True - - return False - - def has_errors(self, include_subsections: bool = False) -> bool: - """ - Returns: - bool: `True` if one or more errors were found in the current section or its subsections. - """ - if include_subsections: - for node in self.iterate_nodes(): - for _ in node.errors(): - return True - else: - for _ in self.errors(): - return True - - return False - - def has_errors_or_warnings(self, include_subsections: bool = False) -> bool: - """ - Returns: - bool: `True` if one or more errors or warnings were found in the current section or its subsections. - """ - if include_subsections: - for node in self.iterate_nodes(): - for _ in node.warnings(): - return True - for _ in node.errors(): - return True - else: - for _ in self.warnings(): - return True - for _ in self.errors(): - return True - - return False - - def __str__(self): - return f'NotepadTree(label={self._label}, level={self._level}, children={[ch.identifier for ch in self._children]})' - - -class Auditor(typing.Generic[IN, OUT], metaclass=abc.ABCMeta): - """ - `Auditor` checks the inputs for sanity issues and relates the issues with sanitized inputs - as :class:`SanitationResults`. - - The auditor may sanitize the input as a matter of discretion and returns the input as `OUT`. - """ - - @staticmethod - def prepare_notepad(label: str) -> NotepadTree: - """ - Prepare a :class:`Notepad` for recording issues and errors. - - Args: - label: a `str` with the top-level section label. - - Returns: - NotepadTree: an instance of :class:`NotepadTree`. - """ - return NotepadTree(label, level=0) - - @abc.abstractmethod - def process(self, data: IN, notepad: Notepad) -> OUT: - """ - Audit and sanitize the `data`, record the issues to the `notepad` and return the sanitized data. - """ - pass diff --git a/src/gpsea/preprocessing/_config.py b/src/gpsea/preprocessing/_config.py index 8ff816b9e..99fdec3c7 100644 --- a/src/gpsea/preprocessing/_config.py +++ b/src/gpsea/preprocessing/_config.py @@ -11,6 +11,8 @@ PhenotypicAbnormalityValidator, ) +from stairval.notepad import create_notepad + # pyright: reportGeneralTypeIssues=false from google.protobuf.json_format import Parse from phenopackets import Phenopacket @@ -375,7 +377,7 @@ def load_phenopackets( cohort_iter = tqdm( phenopackets, desc="Individuals Processed", file=sys.stdout, unit="individuals" ) - notepad = cohort_creator.prepare_notepad("Phenopackets") + notepad = create_notepad(label="Phenopackets") cohort = cohort_creator.process(cohort_iter, notepad) validation_result = PreprocessingValidationResult( @@ -383,6 +385,7 @@ def load_phenopackets( notepad=notepad, ) + assert cohort is not None return cohort, validation_result diff --git a/src/gpsea/preprocessing/_patient.py b/src/gpsea/preprocessing/_patient.py index 7a9ed95cc..7f51ed50c 100644 --- a/src/gpsea/preprocessing/_patient.py +++ b/src/gpsea/preprocessing/_patient.py @@ -2,12 +2,10 @@ import typing -import hpotk +from stairval.notepad import Notepad from gpsea.model import Patient, Cohort -from ._audit import Auditor, Notepad - T = typing.TypeVar('T') """ The input for `PatientCreator`. @@ -16,27 +14,40 @@ """ -class PatientCreator(typing.Generic[T], Auditor[T, Patient], metaclass=abc.ABCMeta): +class PatientCreator(typing.Generic[T], metaclass=abc.ABCMeta): """ `PatientCreator` can create a `Patient` from some input `T`. - - `PatientCreator` is an `Auditor`, hence the input is sanitized and any errors are reported to the caller. """ - pass + + @abc.abstractmethod + def process( + self, + item: T, + notepad: Notepad, + ) -> typing.Optional[Patient]: + pass -class CohortCreator(typing.Generic[T], Auditor[typing.Iterable[T], Cohort]): +class CohortCreator(typing.Generic[T]): """ `CohortCreator` creates a cohort from an iterable of some `T` where `T` represents a cohort member. """ - def __init__(self, patient_creator: PatientCreator[T]): + def __init__( + self, + patient_creator: PatientCreator[T], + ): # Check that we're getting a `PatientCreator`. # Unfortunately, we cannot check that `T`s of `PatientCreator` and `CohortCreator` actually match # due to Python's loosey-goosey nature. - self._pc = hpotk.util.validate_instance(patient_creator, PatientCreator, 'patient_creator') - - def process(self, inputs: typing.Iterable[T], notepad: Notepad) -> Cohort: + assert isinstance(patient_creator, PatientCreator) + self._pc = patient_creator + + def process( + self, + inputs: typing.Iterable[T], + notepad: Notepad, + ) -> Cohort: patients = [] patient_labels = set() duplicate_pat_labels = set() @@ -44,21 +55,26 @@ def process(self, inputs: typing.Iterable[T], notepad: Notepad) -> Cohort: for i, pp in enumerate(inputs): sub = notepad.add_subsection(f'patient #{i}') patient = self._pc.process(pp, sub) - if patient.labels in patient_labels: - duplicate_pat_labels.add(patient.labels) - patient_labels.add(patient.labels) - patients.append(patient) + if patient is not None: + if patient.labels in patient_labels: + duplicate_pat_labels.add(patient.labels) + patient_labels.add(patient.labels) + patients.append(patient) # What happens if a sample has if len(duplicate_pat_labels) > 0: label_summaries = [d.label_summary() for d in duplicate_pat_labels] label_summaries.sort() - notepad.add_error(f"Patient ID/s {', '.join(label_summaries)} have a duplicate", - "Please verify every patient has an unique ID.") + notepad.add_error( + f"Patient ID/s {', '.join(label_summaries)} have a duplicate", + "Please verify every patient has an unique ID.", + ) # We should have >1 patients in the cohort, right? if len(patients) <= 1: - notepad.add_warning(f'Cohort must include {len(patients)}>1 members', - 'Fix issues in patients to enable the analysis') + notepad.add_error( + f'Cohort must include {len(patients)}>1 members', + 'Fix issues in patients to enable the analysis', + ) return Cohort.from_patients(patients) diff --git a/src/gpsea/preprocessing/_phenopacket.py b/src/gpsea/preprocessing/_phenopacket.py index 68d38c51c..3c7486c64 100644 --- a/src/gpsea/preprocessing/_phenopacket.py +++ b/src/gpsea/preprocessing/_phenopacket.py @@ -5,6 +5,9 @@ from hpotk.util import validate_instance from hpotk.validate import ValidationRunner, ValidationLevel +from stairval import Level +from stairval.notepad import Notepad + import phenopackets.schema.v2.core.individual_pb2 as ppi from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket from phenopackets.schema.v2.core.base_pb2 import TimeElement as PPTimeElement @@ -35,7 +38,6 @@ FunctionalAnnotator, ImpreciseSvFunctionalAnnotator, ) -from ._audit import Notepad, Level from ._patient import PatientCreator @@ -285,7 +287,11 @@ def __init__( "1000044", # chromosomal_translocation } - def process(self, pp: Phenopacket, notepad: Notepad) -> Patient: + def process( + self, + pp: Phenopacket, + notepad: Notepad, + ) -> typing.Optional[Patient]: """Creates a Patient from the data in a given Phenopacket Args: diff --git a/tests/preprocessing/conftest.py b/tests/preprocessing/conftest.py index 28b6dca64..5ff2883b1 100644 --- a/tests/preprocessing/conftest.py +++ b/tests/preprocessing/conftest.py @@ -2,6 +2,7 @@ import pytest + @pytest.fixture(scope='session') def fpath_preprocessing_data_dir() -> str: parent = os.path.dirname(__file__) diff --git a/tests/preprocessing/test_patient_and_cohort_creator.py b/tests/preprocessing/test_patient_and_cohort_creator.py index 0ac8fbdc1..f8cf918d3 100644 --- a/tests/preprocessing/test_patient_and_cohort_creator.py +++ b/tests/preprocessing/test_patient_and_cohort_creator.py @@ -102,7 +102,10 @@ def test_cohort_creator( outfile = io.StringIO() results.summarize(outfile) - actual_lines = outfile.getvalue().split(os.linesep) + actual_lines = outfile.getvalue().splitlines(keepends=False) - expected = " Patient ID/s Pat_1[PMID_12345], Pat_2[PMID_67890] have a duplicate. Please verify every patient has an unique ID." + expected = ( + " ·Patient ID/s Pat_1[PMID_12345], Pat_2[PMID_67890] have a duplicate. " + "Please verify every patient has an unique ID." + ) assert expected in actual_lines diff --git a/tests/preprocessing/test_phenopacket.py b/tests/preprocessing/test_phenopacket.py index f46301865..6c1a2e27f 100644 --- a/tests/preprocessing/test_phenopacket.py +++ b/tests/preprocessing/test_phenopacket.py @@ -3,6 +3,9 @@ import hpotk import pytest +from stairval import Level +from stairval.notepad import create_notepad + from google.protobuf.json_format import Parse from phenopackets.schema.v2.core.interpretation_pb2 import GenomicInterpretation from phenopackets.schema.v2.phenopackets_pb2 import Phenopacket @@ -21,7 +24,6 @@ ImpreciseSvFunctionalAnnotator, DefaultImpreciseSvFunctionalAnnotator, ) -from gpsea.preprocessing import Level from gpsea.preprocessing import PhenopacketPatientCreator from gpsea.preprocessing import VVMultiCoordinateService @@ -208,8 +210,10 @@ def test_phenopacket_patient_creator( phenopacket: Phenopacket, patient_creator: PhenopacketPatientCreator, ): - notepad = patient_creator.prepare_notepad("A phenopacket") + notepad = create_notepad("A phenopacket") patient = patient_creator.process(phenopacket, notepad) + + assert patient is not None # No issues assert not notepad.has_errors_or_warnings(include_subsections=True) @@ -295,7 +299,7 @@ def test_individual_with_no_genotype( pp.CopyFrom(phenopacket) del pp.interpretations[:] # clear variants - notepad = patient_creator.prepare_notepad("no-gt") + notepad = create_notepad("no-gt") _ = patient_creator.process(pp=pp, notepad=notepad) @@ -322,7 +326,7 @@ def test_individual_with_no_phenotype( del pp.diseases[:] del pp.measurements[:] - notepad = patient_creator.prepare_notepad("no-gt") + notepad = create_notepad("no-gt") _ = patient_creator.process(pp=pp, notepad=notepad)