From 4564ad0dbcc98578a0916c0ed15e7e3cff2aae56 Mon Sep 17 00:00:00 2001 From: "Moritz E. Beber" Date: Sat, 6 Jan 2024 17:09:21 +0100 Subject: [PATCH] refactor: replace goodtables with pandera --- setup.cfg | 3 +- src/memote/experimental/checks.py | 80 ------------------- src/memote/experimental/essentiality.py | 62 +++++++------- src/memote/experimental/experimental_base.py | 38 +-------- src/memote/experimental/growth.py | 55 ++++++++----- src/memote/experimental/medium.py | 46 +++++++---- .../experimental/schemata/essentiality.json | 47 ----------- src/memote/experimental/schemata/growth.json | 57 ------------- src/memote/experimental/schemata/medium.json | 30 ------- .../experimental/schemata/production.json | 46 ----------- src/memote/experimental/tabular.py | 47 +++++++++-- 11 files changed, 142 insertions(+), 369 deletions(-) delete mode 100644 src/memote/experimental/checks.py delete mode 100644 src/memote/experimental/schemata/essentiality.json delete mode 100644 src/memote/experimental/schemata/growth.json delete mode 100644 src/memote/experimental/schemata/medium.json delete mode 100644 src/memote/experimental/schemata/production.json diff --git a/setup.cfg b/setup.cfg index 5c6ad6019..d9ab3b2f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,8 @@ install_requires = requests numpydoc pylru - goodtables ~=2.0 + pandera ~=0.18 + jsonschema ~=4.20 depinfo ~=2.2 requests python_requires = >=3.6 diff --git a/src/memote/experimental/checks.py b/src/memote/experimental/checks.py deleted file mode 100644 index e1d2c77c9..000000000 --- a/src/memote/experimental/checks.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability, -# Technical University of Denmark. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Custom checks for `goodtables`.""" - -from __future__ import absolute_import - -from goodtables import Error, check - - -@check("unknown-identifier", type="custom", context="body") -class UnknownIdentifier: - """ - Validate data identifiers against a known set. - - Attributes - ---------- - column : str - The header of the data column to check. - identifiers : iterable of str - The known set of identifiers. - - """ - - def __init__(self, column, identifiers, **_): - """ - Initialize the custom identfier check. - - Parameters - ---------- - column : str - The header of the data column to check. - identifiers : iterable of str - The known set of identifiers. 
- - """ - self.column = column - self.identifiers = frozenset(identifiers) - - def check_row(self, cells): - """Check each row in the data table.""" - cell = None - for item in cells: - if item["header"] == self.column: - cell = item - break - - if cell is None: - error = Error( - "unknown-identifier", - row_number=cells[0]["row-number"], - message="Checking identifiers requires the column " - "'{column}' to exist.".format(column=self.column), - ) - return [error] - - value = cell.get("value") - if value not in self.identifiers: - error = Error( - "unknown-identifier", - cell, - message="Value '{value}' in column {header} on row " - "{row_number} is an unknown identifier.", - message_substitutions={"value": value}, - ) - return [error] diff --git a/src/memote/experimental/essentiality.py b/src/memote/experimental/essentiality.py index 2b7c05a1d..8e02de718 100644 --- a/src/memote/experimental/essentiality.py +++ b/src/memote/experimental/essentiality.py @@ -20,8 +20,11 @@ from __future__ import absolute_import import logging +from typing import Optional +import pandera as pa from cobra.flux_analysis import single_gene_deletion +from pandera.typing import Series from memote.experimental.experiment import Experiment @@ -31,11 +34,31 @@ LOGGER = logging.getLogger(__name__) +class EssentialityExperimentModel(pa.DataFrameModel): + gene: Series[str] = pa.Field( + title="Gene Identifier", + description="The gene identifier must correspond to the metabolic model " + "identifiers.", + unique=True, + ) + essential: Series[bool] = pa.Field( + title="Gene Essentiality", + description="Whether a gene is (conditionally) essential.", + ) + comment: Optional[Series[str]] = pa.Field( + nullable=True, + title="Comment", + description="Optional comment which is not processed further.", + ) + + class Config: + coerce = True + strict = "filter" + + class EssentialityExperiment(Experiment): """Represent an essentiality experiment.""" - SCHEMA = "essentiality.json" - def __init__(self, 
**kwargs): """ Initialize an essentiality experiment. @@ -47,39 +70,10 @@ def __init__(self, **kwargs): """ super(EssentialityExperiment, self).__init__(**kwargs) - def load(self, dtype_conversion=None): - """ - Load the data table and corresponding validation schema. - - Parameters - ---------- - dtype_conversion : dict - Column names as keys and corresponding type for loading the data. - Please take a look at the `pandas documentation - `__ - for detailed explanations. - - """ - if dtype_conversion is None: - dtype_conversion = {"essential": str} - super(EssentialityExperiment, self).load(dtype_conversion=dtype_conversion) - self.data["essential"] = self.data["essential"].isin(self.TRUTHY) - def validate(self, model, checks=None): - """Use a defined schema to validate the medium table format.""" - if checks is None: - checks = [] - custom = [ - { - "unknown-identifier": { - "column": "gene", - "identifiers": {g.id for g in model.genes}, - } - } - ] - super(EssentialityExperiment, self).validate( - model=model, checks=checks + custom - ) + """Use a defined schema to validate the essentiality table format.""" + self.data = EssentialityExperimentModel.validate(self.data, lazy=True) + assert self.data["gene"].isin({g.id for g in model.genes}).all() def evaluate(self, model): """Use the defined parameters to predict single gene essentiality.""" diff --git a/src/memote/experimental/experimental_base.py b/src/memote/experimental/experimental_base.py index 24a9ac5b4..af593da60 100644 --- a/src/memote/experimental/experimental_base.py +++ b/src/memote/experimental/experimental_base.py @@ -17,20 +17,8 @@ """Provide a class for medium definitions.""" -import json import logging - -try: - from importlib.resources import files -except ImportError: - from importlib_resources import files - -from goodtables import validate - -# Importing the checks is necessary in order to register them. 
-import memote.experimental.schemata -from memote.experimental.checks import UnknownIdentifier # noqa: F401 from memote.experimental.tabular import read_tabular @@ -43,9 +31,6 @@ class ExperimentalBase(object): """Represent a specific medium condition.""" - SCHEMA = None - TRUTHY = {"true", "True", "TRUE", "1", "yes", "Yes", "YES"} - def __init__(self, identifier, obj, filename, **kwargs): """ Initialize a medium. @@ -66,11 +51,10 @@ def __init__(self, identifier, obj, filename, **kwargs): self.label = "" self.filename = filename self.data = None - self.schema = None def load(self, dtype_conversion=None): """ - Load the data table and corresponding validation schema. + Load the data table. Parameters ---------- @@ -82,26 +66,10 @@ def load(self, dtype_conversion=None): """ self.data = read_tabular(self.filename, dtype_conversion) - with files(memote.experimental.schemata).joinpath(self.SCHEMA).open( - mode="r", encoding="utf-8" - ) as file_handle: - self.schema = json.load(file_handle) - def validate(self, model, checks=None): + def validate(self, model): """Use a defined schema to validate the given table.""" - if checks is None: - checks = [] - records = self.data.to_dict("records") - self.evaluate_report( - validate( - records, - headers=list(records[0]), - preset="table", - schema=self.schema, - order_fields=True, - checks=checks, - ) - ) + raise NotImplementedError("Base class does not implement this method.") @staticmethod def evaluate_report(report): diff --git a/src/memote/experimental/growth.py b/src/memote/experimental/growth.py index b804080aa..45e922db4 100644 --- a/src/memote/experimental/growth.py +++ b/src/memote/experimental/growth.py @@ -20,8 +20,11 @@ from __future__ import absolute_import import logging +from typing import Optional +import pandera as pa from pandas import DataFrame +from pandera.typing import Series from memote.experimental.experiment import Experiment @@ -31,11 +34,39 @@ LOGGER = logging.getLogger(__name__) +class 
GrowthExperimentModel(pa.DataFrameModel): + exchange: Series[str] = pa.Field( + description="The exchange reaction identifier of the variable medium " + "component. Typically, this is a carbon source which will be added to a " + "configured base medium.", + title="Exchange Reaction Identifier", + ) + uptake: Series[float] = pa.Field( + ge=0.0, + le=1000.0, + title="Uptake Rate", + description="The uptake rate for the exchange reaction. For models following " + "common practices this modifies the lower bound.", + ) + growth: Series[bool] = pa.Field( + title="Growth", + description="A binary indicator whether growth was observed according to the " + "processed biolog data.", + ) + comment: Optional[Series[str]] = pa.Field( + nullable=True, + title="Comment", + description="Optional comment which is not processed further.", + ) + + class Config: + coerce = True + strict = "filter" + + class GrowthExperiment(Experiment): """Represent a growth experiment.""" - SCHEMA = "growth.json" - def __init__(self, **kwargs): """ Initialize a growth experiment. @@ -47,23 +78,9 @@ def __init__(self, **kwargs): """ super(GrowthExperiment, self).__init__(**kwargs) - def load(self, dtype_conversion=None): - """ - Load the data table and corresponding validation schema. - - Parameters - ---------- - dtype_conversion : dict - Column names as keys and corresponding type for loading the data. - Please take a look at the `pandas documentation - `__ - for detailed explanations. 
- - """ - if dtype_conversion is None: - dtype_conversion = {"growth": str} - super(GrowthExperiment, self).load(dtype_conversion=dtype_conversion) - self.data["growth"] = self.data["growth"].isin(self.TRUTHY) + def validate(self, model): + """Use a defined schema to validate the growth table format.""" + self.data = GrowthExperimentModel.validate(self.data, lazy=True) def evaluate(self, model): """Evaluate in silico growth rates.""" diff --git a/src/memote/experimental/medium.py b/src/memote/experimental/medium.py index d162d9edd..1a6551598 100644 --- a/src/memote/experimental/medium.py +++ b/src/memote/experimental/medium.py @@ -20,6 +20,10 @@ from __future__ import absolute_import import logging +from typing import Optional + +import pandera as pa +from pandera.typing import Series from memote.experimental.experimental_base import ExperimentalBase @@ -29,11 +33,34 @@ LOGGER = logging.getLogger(__name__) +class MediumModel(pa.DataFrameModel): + exchange: Series[str] = pa.Field( + description="The exchange reaction identifiers must correspond to the " + "metabolic model identifiers.", + title="Exchange Reaction Identifier", + unique=True, + ) + uptake: Series[float] = pa.Field( + ge=0.0, + le=1000.0, + title="Uptake Rate", + description="The uptake rate for the exchange reaction. For models following " + "common practices this modifies the lower bound.", + ) + comment: Optional[Series[str]] = pa.Field( + nullable=True, + title="Comment", + description="Optional comment which is not processed further.", + ) + + class Config: + coerce = True + strict = "filter" + + class Medium(ExperimentalBase): """Represent a specific medium condition.""" - SCHEMA = "medium.json" - def __init__(self, **kwargs): """ Initialize a medium. 
@@ -45,19 +72,10 @@ def __init__(self, **kwargs): """ super(Medium, self).__init__(**kwargs) - def validate(self, model, checks=None): + def validate(self, model): """Use a defined schema to validate the medium table format.""" - if checks is None: - checks = [] - custom = [ - { - "unknown-identifier": { - "column": "exchange", - "identifiers": {r.id for r in model.reactions}, - } - } - ] - super(Medium, self).validate(model=model, checks=checks + custom) + self.data = MediumModel.validate(self.data, lazy=True) + assert self.data["exchange"].isin({r.id for r in model.reactions}).all() def apply(self, model): """Set the defined medium on the given model.""" diff --git a/src/memote/experimental/schemata/essentiality.json b/src/memote/experimental/schemata/essentiality.json deleted file mode 100644 index ba5fb8cd3..000000000 --- a/src/memote/experimental/schemata/essentiality.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "fields": [ - { - "name": "gene", - "title": "Gene Identifier", - "description": "The gene identifier must correspond to the metabolic model identifiers.", - "type": "string", - "constraints": { - "required": true, - "unique": true - } - }, - { - "name": "essential", - "title": "Gene Essentiality", - "description": "Whether or not a gene is (conditionally) essential.", - "type": "boolean", - "constraints": { - "required": true - }, - "trueValues": [ - "true", - "True", - "TRUE", - "1", - "yes", - "Yes", - "YES" - ], - "falseValues": [ - "false", - "False", - "FALSE", - "0", - "no", - "No", - "NO" - ] - }, - { - "name": "comment", - "title": "Comment", - "description": "Optional comment which is not processed further.", - "type": "string" - } - ] -} diff --git a/src/memote/experimental/schemata/growth.json b/src/memote/experimental/schemata/growth.json deleted file mode 100644 index 5634a0fbc..000000000 --- a/src/memote/experimental/schemata/growth.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "fields": [ - { - "name": "exchange", - "title": "Exchange Reaction Identifier", 
- "description": "The exchange reaction identifier of the variable medium component. Typically, this is a carbon source which will be added to a configured base medium.", - "type": "string", - "constraints": { - "required": true - } - }, - { - "name": "uptake", - "title": "Uptake Rate", - "description": "The uptake rate for the exchange reaction. For models following common practices this modifies the lower bound.", - "type": "number", - "constraints": { - "required": true, - "minimum": 0, - "maximum": 1000 - } - }, - { - "name": "growth", - "title": "Growth", - "description": "A binary indicator whether growth was observed according to the processed biolog data.", - "type": "boolean", - "constraints": { - "required": true - }, - "trueValues": [ - "true", - "True", - "TRUE", - "1", - "yes", - "Yes", - "YES" - ], - "falseValues": [ - "false", - "False", - "FALSE", - "0", - "no", - "No", - "NO" - ] - }, - { - "name": "comment", - "title": "Comment", - "description": "Optional comment which is not processed further.", - "type": "string" - } - ] -} diff --git a/src/memote/experimental/schemata/medium.json b/src/memote/experimental/schemata/medium.json deleted file mode 100644 index 8cac2c357..000000000 --- a/src/memote/experimental/schemata/medium.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "fields": [ - { - "name": "exchange", - "title": "Exchange Reaction Identifier", - "description": "The exchange reaction identifiers must correspond to the metabolic model identifiers.", - "type": "string", - "constraints": { - "required": true - } - }, - { - "name": "uptake", - "title": "Uptake Rate", - "description": "The uptake rate for the exchange reaction. 
For models following common practices this modifies the lower bound.", - "type": "number", - "constraints": { - "required": true, - "minimum": 0, - "maximum": 1000 - } - }, - { - "name": "comment", - "title": "Comment", - "description": "Optional comment which is not processed further.", - "type": "string" - } - ] -} diff --git a/src/memote/experimental/schemata/production.json b/src/memote/experimental/schemata/production.json deleted file mode 100644 index a8b63b317..000000000 --- a/src/memote/experimental/schemata/production.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "fields": [ - { - "name": "compound", - "title": "Production Compound Identifier", - "description": "The product identifier must correspond with the metabolic model identifiers.", - "type": "string", - "constraints": { - "required": true - } - }, - { - "name": "production", - "title": "production Rate", - "description": "The measured production rate of the strain.", - "type": "number", - "constraints": { - "required": true, - "minimum": 0 - } - }, - { - "name": "growth", - "title": "Growth Rate", - "description": "The measured growth rate of the production strain.", - "type": "number", - "constraints": { - "required": true, - "minimum": 0, - "maximum": 10 - } - }, - { - "name": "medium", - "title": "Medium Composition", - "description": "The filename of the medium composition used if any.", - "type": "string" - }, - { - "name": "comment", - "title": "Comment", - "description": "Optional comment which is not processed further.", - "type": "string" - } - ] -} diff --git a/src/memote/experimental/tabular.py b/src/memote/experimental/tabular.py index 985d186f9..04fbdd873 100644 --- a/src/memote/experimental/tabular.py +++ b/src/memote/experimental/tabular.py @@ -22,7 +22,12 @@ import pandas as pd -def read_tabular(filename, dtype_conversion=None): +def read_tabular( + filename, + dtype_conversion=None, + truthy=("yes", "Yes", "YES"), + falsy=("no", "No", "NO"), +): """ Read a tabular data file which can 
be CSV, TSV, XLS or XLSX. @@ -49,15 +54,45 @@ def read_tabular(filename, dtype_conversion=None): # Completely empty columns are interpreted as float by default. dtype_conversion["comment"] = str if "csv" in ext: - df = pd.read_csv(filename, dtype=dtype_conversion, encoding="utf-8") + df = pd.read_csv( + filename, + dtype=dtype_conversion, + true_values=list(truthy), + false_values=list(falsy), + encoding="utf-8", + ) elif "tsv" in ext: - df = pd.read_table(filename, sep="\t", dtype=dtype_conversion, encoding="utf-8") + df = pd.read_table( + filename, + sep="\t", + dtype=dtype_conversion, + true_values=list(truthy), + false_values=list(falsy), + encoding="utf-8", + ) elif "xlsx" in ext: - df = pd.read_excel(filename, dtype=dtype_conversion, engine="openpyxl") + df = pd.read_excel( + filename, + dtype=dtype_conversion, + true_values=list(truthy), + false_values=list(falsy), + engine="openpyxl", + ) elif "xls" in ext: - df = pd.read_excel(filename, dtype=dtype_conversion, engine="xlrd") + df = pd.read_excel( + filename, + dtype=dtype_conversion, + true_values=list(truthy), + false_values=list(falsy), + engine="xlrd", + ) elif "ods" in ext: - df = pd.read_excel(filename, dtype=dtype_conversion) + df = pd.read_excel( + filename, + dtype=dtype_conversion, + true_values=list(truthy), + false_values=list(falsy), + ) else: raise ValueError("Unknown file format '{}'.".format(ext)) return df