Skip to content

Commit

Permalink
refactor: replace goodtables with pandera
Browse files Browse the repository at this point in the history
  • Loading branch information
Midnighter committed Jan 8, 2024
1 parent dfe79f4 commit 4564ad0
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 369 deletions.
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ install_requires =
requests
numpydoc
pylru
goodtables ~=2.0
pandera ~=0.18
jsonschema ~=4.20
depinfo ~=2.2
requests
python_requires = >=3.6
Expand Down
80 changes: 0 additions & 80 deletions src/memote/experimental/checks.py

This file was deleted.

62 changes: 28 additions & 34 deletions src/memote/experimental/essentiality.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
from __future__ import absolute_import

import logging
from typing import Optional

import pandera as pa
from cobra.flux_analysis import single_gene_deletion
from pandera.typing import Series

from memote.experimental.experiment import Experiment

Expand All @@ -31,11 +34,31 @@
LOGGER = logging.getLogger(__name__)


class EssentialityExperimentModel(pa.DataFrameModel):
    """
    Describe the expected layout of a gene essentiality data table.

    Columns
    -------
    gene : str
        Gene identifier; every value has to be unique.
    essential : bool
        Flag stating whether the gene is essential.
    comment : str, optional
        Free-text remark which may contain missing values.

    """

    gene: Series[str] = pa.Field(
        unique=True,
        title="Gene Identifier",
        description=(
            "The gene identifier must correspond to the metabolic model "
            "identifiers."
        ),
    )
    essential: Series[bool] = pa.Field(
        description="Whether a gene is (conditionally) essential.",
        title="Gene Essentiality",
    )
    comment: Optional[Series[str]] = pa.Field(
        title="Comment",
        description="Optional comment which is not processed further.",
        nullable=True,
    )

    class Config:
        # Per the pandera documentation: ``coerce`` casts columns to the
        # annotated types during validation and ``strict = "filter"`` drops
        # columns that are not declared on the model.
        strict = "filter"
        coerce = True


class EssentialityExperiment(Experiment):
"""Represent an essentiality experiment."""

SCHEMA = "essentiality.json"

def __init__(self, **kwargs):
"""
Initialize an essentiality experiment.
Expand All @@ -47,39 +70,10 @@ def __init__(self, **kwargs):
"""
super(EssentialityExperiment, self).__init__(**kwargs)

def load(self, dtype_conversion=None):
    """
    Read the data table and turn the ``essential`` column into booleans.

    Parameters
    ----------
    dtype_conversion : dict
        Column names as keys and corresponding type for loading the data.
        Please take a look at the `pandas documentation
        <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
        for detailed explanations. When omitted, the ``essential`` column
        is requested as ``str``.

    """
    column_types = (
        {"essential": str} if dtype_conversion is None else dtype_conversion
    )
    super(EssentialityExperiment, self).load(dtype_conversion=column_types)
    # Only values contained in ``TRUTHY`` become True; everything else is
    # mapped to False.
    is_essential = self.data["essential"].isin(self.TRUTHY)
    self.data["essential"] = is_essential

def validate(self, model, checks=None):
    """
    Validate the essentiality table against its schema and the model genes.

    Parameters
    ----------
    model
        Object exposing ``genes``, each with an ``id`` attribute
        (presumably a cobra model -- confirm against the caller).
    checks : list, optional
        Unused. Only accepted so that callers which still pass ``checks``
        keep working.

    Raises
    ------
    pandera.errors.SchemaErrors
        Raised by pandera when the table violates
        ``EssentialityExperimentModel``; with ``lazy=True`` failures are
        collected before raising.
    AssertionError
        If the ``gene`` column contains identifiers that do not belong to
        any gene of the model.

    """
    # ``validate`` returns the coerced/filtered table instead of modifying
    # its argument, hence the result must be stored to take effect.
    self.data = EssentialityExperimentModel.validate(self.data, lazy=True)
    known_genes = {g.id for g in model.genes}
    unknown = self.data.loc[~self.data["gene"].isin(known_genes), "gene"]
    if not unknown.empty:
        # Raised explicitly rather than via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            "The following gene identifiers are not part of the model: "
            "{}.".format(", ".join(map(str, unknown.unique())))
        )

def evaluate(self, model):
"""Use the defined parameters to predict single gene essentiality."""
Expand Down
38 changes: 3 additions & 35 deletions src/memote/experimental/experimental_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,8 @@
"""Provide a class for medium definitions."""


import json
import logging


try:
from importlib.resources import files
except ImportError:
from importlib_resources import files

from goodtables import validate

# Importing the checks is necessary in order to register them.
import memote.experimental.schemata
from memote.experimental.checks import UnknownIdentifier # noqa: F401
from memote.experimental.tabular import read_tabular


Expand All @@ -43,9 +31,6 @@
class ExperimentalBase(object):
"""Represent a specific medium condition."""

SCHEMA = None
TRUTHY = {"true", "True", "TRUE", "1", "yes", "Yes", "YES"}

def __init__(self, identifier, obj, filename, **kwargs):
"""
Initialize a medium.
Expand All @@ -66,11 +51,10 @@ def __init__(self, identifier, obj, filename, **kwargs):
self.label = ""
self.filename = filename
self.data = None
self.schema = None

def load(self, dtype_conversion=None):
"""
Load the data table and corresponding validation schema.
Load the data table.
Parameters
----------
Expand All @@ -82,26 +66,10 @@ def load(self, dtype_conversion=None):
"""
self.data = read_tabular(self.filename, dtype_conversion)
with files(memote.experimental.schemata).joinpath(self.SCHEMA).open(
mode="r", encoding="utf-8"
) as file_handle:
self.schema = json.load(file_handle)

def validate(self, model, checks=None):
def validate(self, model):
    """
    Validate the loaded data table; the base class provides no implementation.

    Parameters
    ----------
    model
        The model to validate the table against. Unused here.

    Raises
    ------
    NotImplementedError
        Always. Subclasses are expected to override this method.

    """
    # The exception has to be raised; merely constructing it would turn this
    # method into a silent no-op that reports every table as valid.
    raise NotImplementedError("Base class does not implement this method.")

@staticmethod
def evaluate_report(report):
Expand Down
55 changes: 36 additions & 19 deletions src/memote/experimental/growth.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
from __future__ import absolute_import

import logging
from typing import Optional

import pandera as pa
from pandas import DataFrame
from pandera.typing import Series

from memote.experimental.experiment import Experiment

Expand All @@ -31,11 +34,39 @@
LOGGER = logging.getLogger(__name__)


class GrowthExperimentModel(pa.DataFrameModel):
    """
    Describe the expected layout of a growth experiment data table.

    Columns
    -------
    exchange : str
        Identifier of the exchange reaction for the variable medium component.
    uptake : float
        Uptake rate, restricted to the closed interval from 0 to 1000.
    growth : bool
        Flag stating whether growth was observed.
    comment : str, optional
        Free-text remark which may contain missing values.

    """

    exchange: Series[str] = pa.Field(
        title="Exchange Reaction Identifier",
        description=(
            "The exchange reaction identifier of the variable medium "
            "component. Typically, this is a carbon source which will be added to a "
            "configured base medium."
        ),
    )
    uptake: Series[float] = pa.Field(
        title="Uptake Rate",
        description=(
            "The uptake rate for the exchange reaction. For models following "
            "common practices this modifies the lower bound."
        ),
        ge=0.0,
        le=1000.0,
    )
    growth: Series[bool] = pa.Field(
        description=(
            "A binary indicator whether growth was observed according to the "
            "processed biolog data."
        ),
        title="Growth",
    )
    comment: Optional[Series[str]] = pa.Field(
        title="Comment",
        description="Optional comment which is not processed further.",
        nullable=True,
    )

    class Config:
        # Per the pandera documentation: ``coerce`` casts columns to the
        # annotated types during validation and ``strict = "filter"`` drops
        # columns that are not declared on the model.
        strict = "filter"
        coerce = True


class GrowthExperiment(Experiment):
"""Represent a growth experiment."""

SCHEMA = "growth.json"

def __init__(self, **kwargs):
"""
Initialize a growth experiment.
Expand All @@ -47,23 +78,9 @@ def __init__(self, **kwargs):
"""
super(GrowthExperiment, self).__init__(**kwargs)

def load(self, dtype_conversion=None):
    """
    Read the data table and turn the ``growth`` column into booleans.

    Parameters
    ----------
    dtype_conversion : dict
        Column names as keys and corresponding type for loading the data.
        Please take a look at the `pandas documentation
        <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
        for detailed explanations. When omitted, the ``growth`` column is
        requested as ``str``.

    """
    column_types = (
        {"growth": str} if dtype_conversion is None else dtype_conversion
    )
    super(GrowthExperiment, self).load(dtype_conversion=column_types)
    # Only values contained in ``TRUTHY`` become True; everything else is
    # mapped to False.
    has_growth = self.data["growth"].isin(self.TRUTHY)
    self.data["growth"] = has_growth
def validate(self, model):
    """
    Validate the growth table against ``GrowthExperimentModel``.

    Parameters
    ----------
    model
        Unused here; accepted to keep the signature in line with the other
        experiment classes.

    Raises
    ------
    pandera.errors.SchemaErrors
        Raised by pandera when the table violates the schema; with
        ``lazy=True`` failures are collected before raising.

    """
    # ``validate`` returns the coerced/filtered table instead of modifying
    # its argument, hence the result must be stored to take effect.
    self.data = GrowthExperimentModel.validate(self.data, lazy=True)

def evaluate(self, model):
"""Evaluate in silico growth rates."""
Expand Down
Loading

0 comments on commit 4564ad0

Please sign in to comment.