From 4564ad0dbcc98578a0916c0ed15e7e3cff2aae56 Mon Sep 17 00:00:00 2001
From: "Moritz E. Beber" <midnighter@posteo.net>
Date: Sat, 6 Jan 2024 17:09:21 +0100
Subject: [PATCH] refactor: replace goodtables with pandera

---
 setup.cfg                                     |  3 +-
 src/memote/experimental/checks.py             | 80 -------------------
 src/memote/experimental/essentiality.py       | 62 +++++++-------
 src/memote/experimental/experimental_base.py  | 38 +--------
 src/memote/experimental/growth.py             | 55 ++++++++-----
 src/memote/experimental/medium.py             | 46 +++++++----
 .../experimental/schemata/essentiality.json   | 47 -----------
 src/memote/experimental/schemata/growth.json  | 57 -------------
 src/memote/experimental/schemata/medium.json  | 30 -------
 .../experimental/schemata/production.json     | 46 -----------
 src/memote/experimental/tabular.py            | 47 +++++++++--
 11 files changed, 142 insertions(+), 369 deletions(-)
 delete mode 100644 src/memote/experimental/checks.py
 delete mode 100644 src/memote/experimental/schemata/essentiality.json
 delete mode 100644 src/memote/experimental/schemata/growth.json
 delete mode 100644 src/memote/experimental/schemata/medium.json
 delete mode 100644 src/memote/experimental/schemata/production.json

diff --git a/setup.cfg b/setup.cfg
index 5c6ad6019..d9ab3b2f0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,8 @@ install_requires =
 	requests
 	numpydoc
 	pylru
-	goodtables ~=2.0
+	pandera ~=0.18
+	jsonschema ~=4.20
 	depinfo ~=2.2
 	requests
 python_requires = >=3.6
diff --git a/src/memote/experimental/checks.py b/src/memote/experimental/checks.py
deleted file mode 100644
index e1d2c77c9..000000000
--- a/src/memote/experimental/checks.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Novo Nordisk Foundation Center for Biosustainability,
-# Technical University of Denmark.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Custom checks for `goodtables`."""
-
-from __future__ import absolute_import
-
-from goodtables import Error, check
-
-
-@check("unknown-identifier", type="custom", context="body")
-class UnknownIdentifier:
-    """
-    Validate data identifiers against a known set.
-
-    Attributes
-    ----------
-    column : str
-        The header of the data column to check.
-    identifiers : iterable of str
-        The known set of identifiers.
-
-    """
-
-    def __init__(self, column, identifiers, **_):
-        """
-        Initialize the custom identfier check.
-
-        Parameters
-        ----------
-        column : str
-            The header of the data column to check.
-        identifiers : iterable of str
-            The known set of identifiers.
-
-        """
-        self.column = column
-        self.identifiers = frozenset(identifiers)
-
-    def check_row(self, cells):
-        """Check each row in the data table."""
-        cell = None
-        for item in cells:
-            if item["header"] == self.column:
-                cell = item
-                break
-
-        if cell is None:
-            error = Error(
-                "unknown-identifier",
-                row_number=cells[0]["row-number"],
-                message="Checking identifiers requires the column "
-                "'{column}' to exist.".format(column=self.column),
-            )
-            return [error]
-
-        value = cell.get("value")
-        if value not in self.identifiers:
-            error = Error(
-                "unknown-identifier",
-                cell,
-                message="Value '{value}' in column {header} on row "
-                "{row_number} is an unknown identifier.",
-                message_substitutions={"value": value},
-            )
-            return [error]
diff --git a/src/memote/experimental/essentiality.py b/src/memote/experimental/essentiality.py
index 2b7c05a1d..8e02de718 100644
--- a/src/memote/experimental/essentiality.py
+++ b/src/memote/experimental/essentiality.py
@@ -20,8 +20,11 @@
 from __future__ import absolute_import
 
 import logging
+from typing import Optional
 
+import pandera as pa
 from cobra.flux_analysis import single_gene_deletion
+from pandera.typing import Series
 
 from memote.experimental.experiment import Experiment
 
@@ -31,11 +34,32 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class EssentialityExperimentModel(pa.DataFrameModel):
+    """Define the expected columns of a gene essentiality data table."""
+    gene: Series[str] = pa.Field(
+        title="Gene Identifier",
+        description="The gene identifier must correspond to the metabolic model "
+        "identifiers.",
+        unique=True,
+    )
+    essential: Series[bool] = pa.Field(
+        title="Gene Essentiality",
+        description="Whether a gene is (conditionally) essential.",
+    )
+    comment: Optional[Series[str]] = pa.Field(
+        nullable=True,
+        title="Comment",
+        description="Optional comment which is not processed further.",
+    )
+
+    class Config:
+        coerce = True
+        strict = "filter"
+
+
 class EssentialityExperiment(Experiment):
     """Represent an essentiality experiment."""
 
-    SCHEMA = "essentiality.json"
-
     def __init__(self, **kwargs):
         """
         Initialize an essentiality experiment.
@@ -47,39 +70,18 @@ def __init__(self, **kwargs):
         """
         super(EssentialityExperiment, self).__init__(**kwargs)
 
-    def load(self, dtype_conversion=None):
-        """
-        Load the data table and corresponding validation schema.
-
-        Parameters
-        ----------
-        dtype_conversion : dict
-            Column names as keys and corresponding type for loading the data.
-            Please take a look at the `pandas documentation
-            <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
-            for detailed explanations.
-
-        """
-        if dtype_conversion is None:
-            dtype_conversion = {"essential": str}
-        super(EssentialityExperiment, self).load(dtype_conversion=dtype_conversion)
-        self.data["essential"] = self.data["essential"].isin(self.TRUTHY)
-
     def validate(self, model, checks=None):
-        """Use a defined schema to validate the medium table format."""
-        if checks is None:
-            checks = []
-        custom = [
-            {
-                "unknown-identifier": {
-                    "column": "gene",
-                    "identifiers": {g.id for g in model.genes},
-                }
-            }
-        ]
-        super(EssentialityExperiment, self).validate(
-            model=model, checks=checks + custom
-        )
+        """Use a defined schema to validate the essentiality table format."""
+        # Keep the validated frame: type coercion and column filtering are
+        # returned by pandera, not applied to the input in place.
+        self.data = EssentialityExperimentModel.validate(self.data, lazy=True)
+        # Raise explicitly because `assert` statements vanish under `python -O`.
+        known = {g.id for g in model.genes}
+        unknown = set(self.data["gene"]) - known
+        if unknown:
+            raise ValueError(
+                "Unknown gene identifiers: {}.".format(", ".join(sorted(unknown)))
+            )
 
     def evaluate(self, model):
         """Use the defined parameters to predict single gene essentiality."""
diff --git a/src/memote/experimental/experimental_base.py b/src/memote/experimental/experimental_base.py
index 24a9ac5b4..af593da60 100644
--- a/src/memote/experimental/experimental_base.py
+++ b/src/memote/experimental/experimental_base.py
@@ -17,20 +17,8 @@
 """Provide a class for medium definitions."""
 
 
-import json
 import logging
 
-
-try:
-    from importlib.resources import files
-except ImportError:
-    from importlib_resources import files
-
-from goodtables import validate
-
-# Importing the checks is necessary in order to register them.
-import memote.experimental.schemata
-from memote.experimental.checks import UnknownIdentifier  # noqa: F401
 from memote.experimental.tabular import read_tabular
 
 
@@ -43,9 +31,6 @@
 class ExperimentalBase(object):
     """Represent a specific medium condition."""
 
-    SCHEMA = None
-    TRUTHY = {"true", "True", "TRUE", "1", "yes", "Yes", "YES"}
-
     def __init__(self, identifier, obj, filename, **kwargs):
         """
         Initialize a medium.
@@ -66,11 +51,10 @@ def __init__(self, identifier, obj, filename, **kwargs):
             self.label = ""
         self.filename = filename
         self.data = None
-        self.schema = None
 
     def load(self, dtype_conversion=None):
         """
-        Load the data table and corresponding validation schema.
+        Load the data table.
 
         Parameters
         ----------
@@ -82,26 +66,10 @@ def load(self, dtype_conversion=None):
 
         """
         self.data = read_tabular(self.filename, dtype_conversion)
-        with files(memote.experimental.schemata).joinpath(self.SCHEMA).open(
-            mode="r", encoding="utf-8"
-        ) as file_handle:
-            self.schema = json.load(file_handle)
 
-    def validate(self, model, checks=None):
+    def validate(self, model):
         """Use a defined schema to validate the given table."""
-        if checks is None:
-            checks = []
-        records = self.data.to_dict("records")
-        self.evaluate_report(
-            validate(
-                records,
-                headers=list(records[0]),
-                preset="table",
-                schema=self.schema,
-                order_fields=True,
-                checks=checks,
-            )
-        )
+        raise NotImplementedError("Base class does not implement this method.")
 
     @staticmethod
     def evaluate_report(report):
diff --git a/src/memote/experimental/growth.py b/src/memote/experimental/growth.py
index b804080aa..45e922db4 100644
--- a/src/memote/experimental/growth.py
+++ b/src/memote/experimental/growth.py
@@ -20,8 +20,11 @@
 from __future__ import absolute_import
 
 import logging
+from typing import Optional
 
+import pandera as pa
 from pandas import DataFrame
+from pandera.typing import Series
 
 from memote.experimental.experiment import Experiment
 
@@ -31,11 +34,40 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class GrowthExperimentModel(pa.DataFrameModel):
+    """Define the expected columns of a growth experiment data table."""
+    exchange: Series[str] = pa.Field(
+        description="The exchange reaction identifier of the variable medium "
+        "component. Typically, this is a carbon source which will be added to a "
+        "configured base medium.",
+        title="Exchange Reaction Identifier",
+    )
+    uptake: Series[float] = pa.Field(
+        ge=0.0,
+        le=1000.0,
+        title="Uptake Rate",
+        description="The uptake rate for the exchange reaction. For models following "
+        "common practices this modifies the lower bound.",
+    )
+    growth: Series[bool] = pa.Field(
+        title="Growth",
+        description="A binary indicator whether growth was observed according to the "
+        "processed biolog data.",
+    )
+    comment: Optional[Series[str]] = pa.Field(
+        nullable=True,
+        title="Comment",
+        description="Optional comment which is not processed further.",
+    )
+
+    class Config:
+        coerce = True
+        strict = "filter"
+
+
 class GrowthExperiment(Experiment):
     """Represent a growth experiment."""
 
-    SCHEMA = "growth.json"
-
     def __init__(self, **kwargs):
         """
         Initialize a growth experiment.
@@ -47,23 +78,9 @@ def __init__(self, **kwargs):
         """
         super(GrowthExperiment, self).__init__(**kwargs)
 
-    def load(self, dtype_conversion=None):
-        """
-        Load the data table and corresponding validation schema.
-
-        Parameters
-        ----------
-        dtype_conversion : dict
-            Column names as keys and corresponding type for loading the data.
-            Please take a look at the `pandas documentation
-            <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
-            for detailed explanations.
-
-        """
-        if dtype_conversion is None:
-            dtype_conversion = {"growth": str}
-        super(GrowthExperiment, self).load(dtype_conversion=dtype_conversion)
-        self.data["growth"] = self.data["growth"].isin(self.TRUTHY)
+    def validate(self, model):
+        """Use a defined schema to validate the growth table format."""
+        self.data = GrowthExperimentModel.validate(self.data, lazy=True)
 
     def evaluate(self, model):
         """Evaluate in silico growth rates."""
diff --git a/src/memote/experimental/medium.py b/src/memote/experimental/medium.py
index d162d9edd..1a6551598 100644
--- a/src/memote/experimental/medium.py
+++ b/src/memote/experimental/medium.py
@@ -20,6 +20,10 @@
 from __future__ import absolute_import
 
 import logging
+from typing import Optional
+
+import pandera as pa
+from pandera.typing import Series
 
 from memote.experimental.experimental_base import ExperimentalBase
 
@@ -29,11 +33,35 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class MediumModel(pa.DataFrameModel):
+    """Define the expected columns of a medium definition data table."""
+    exchange: Series[str] = pa.Field(
+        description="The exchange reaction identifiers must correspond to the "
+        "metabolic model identifiers.",
+        title="Exchange Reaction Identifier",
+        unique=True,
+    )
+    uptake: Series[float] = pa.Field(
+        ge=0.0,
+        le=1000.0,
+        title="Uptake Rate",
+        description="The uptake rate for the exchange reaction. For models following "
+        "common practices this modifies the lower bound.",
+    )
+    comment: Optional[Series[str]] = pa.Field(
+        nullable=True,
+        title="Comment",
+        description="Optional comment which is not processed further.",
+    )
+
+    class Config:
+        coerce = True
+        strict = "filter"
+
+
 class Medium(ExperimentalBase):
     """Represent a specific medium condition."""
 
-    SCHEMA = "medium.json"
-
     def __init__(self, **kwargs):
         """
         Initialize a medium.
@@ -45,19 +72,18 @@ def __init__(self, **kwargs):
         """
         super(Medium, self).__init__(**kwargs)
 
-    def validate(self, model, checks=None):
+    def validate(self, model):
         """Use a defined schema to validate the medium table format."""
-        if checks is None:
-            checks = []
-        custom = [
-            {
-                "unknown-identifier": {
-                    "column": "exchange",
-                    "identifiers": {r.id for r in model.reactions},
-                }
-            }
-        ]
-        super(Medium, self).validate(model=model, checks=checks + custom)
+        # Keep the validated frame: type coercion and column filtering are
+        # returned by pandera, not applied to the input in place.
+        self.data = MediumModel.validate(self.data, lazy=True)
+        # Raise explicitly because `assert` statements vanish under `python -O`.
+        known = {r.id for r in model.reactions}
+        unknown = set(self.data["exchange"]) - known
+        if unknown:
+            raise ValueError(
+                "Unknown reaction identifiers: {}.".format(", ".join(sorted(unknown)))
+            )
 
     def apply(self, model):
         """Set the defined medium on the given model."""
diff --git a/src/memote/experimental/schemata/essentiality.json b/src/memote/experimental/schemata/essentiality.json
deleted file mode 100644
index ba5fb8cd3..000000000
--- a/src/memote/experimental/schemata/essentiality.json
+++ /dev/null
@@ -1,47 +0,0 @@
-{
-  "fields": [
-    {
-      "name": "gene",
-      "title": "Gene Identifier",
-      "description": "The gene identifier must correspond to the metabolic model identifiers.",
-      "type": "string",
-      "constraints": {
-        "required": true,
-        "unique": true
-      }
-    },
-    {
-      "name": "essential",
-      "title": "Gene Essentiality",
-      "description": "Whether or not a gene is (conditionally) essential.",
-      "type": "boolean",
-      "constraints": {
-        "required": true
-      },
-      "trueValues": [
-        "true",
-        "True",
-        "TRUE",
-        "1",
-        "yes",
-        "Yes",
-        "YES"
-      ],
-      "falseValues": [
-        "false",
-        "False",
-        "FALSE",
-        "0",
-        "no",
-        "No",
-        "NO"
-      ]
-    },
-    {
-      "name": "comment",
-      "title": "Comment",
-      "description": "Optional comment which is not processed further.",
-      "type": "string"
-    }
-  ]
-}
diff --git a/src/memote/experimental/schemata/growth.json b/src/memote/experimental/schemata/growth.json
deleted file mode 100644
index 5634a0fbc..000000000
--- a/src/memote/experimental/schemata/growth.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "fields": [
-    {
-      "name": "exchange",
-      "title": "Exchange Reaction Identifier",
-      "description": "The exchange reaction identifier of the variable medium component. Typically, this is a carbon source which will be added to a configured base medium.",
-      "type": "string",
-      "constraints": {
-        "required": true
-      }
-    },
-    {
-      "name": "uptake",
-      "title": "Uptake Rate",
-      "description": "The uptake rate for the exchange reaction. For models following common practices this modifies the lower bound.",
-      "type": "number",
-      "constraints": {
-        "required": true,
-        "minimum": 0,
-        "maximum": 1000
-      }
-    },
-    {
-      "name": "growth",
-      "title": "Growth",
-      "description": "A binary indicator whether growth was observed according to the processed biolog data.",
-      "type": "boolean",
-      "constraints": {
-        "required": true
-      },
-      "trueValues": [
-        "true",
-        "True",
-        "TRUE",
-        "1",
-        "yes",
-        "Yes",
-        "YES"
-      ],
-      "falseValues": [
-        "false",
-        "False",
-        "FALSE",
-        "0",
-        "no",
-        "No",
-        "NO"
-      ]
-    },
-    {
-      "name": "comment",
-      "title": "Comment",
-      "description": "Optional comment which is not processed further.",
-      "type": "string"
-    }
-  ]
-}
diff --git a/src/memote/experimental/schemata/medium.json b/src/memote/experimental/schemata/medium.json
deleted file mode 100644
index 8cac2c357..000000000
--- a/src/memote/experimental/schemata/medium.json
+++ /dev/null
@@ -1,30 +0,0 @@
-{
-  "fields": [
-    {
-      "name": "exchange",
-      "title": "Exchange Reaction Identifier",
-      "description": "The exchange reaction identifiers must correspond to the metabolic model identifiers.",
-      "type": "string",
-      "constraints": {
-        "required": true
-      }
-    },
-    {
-      "name": "uptake",
-      "title": "Uptake Rate",
-      "description": "The uptake rate for the exchange reaction. For models following common practices this modifies the lower bound.",
-      "type": "number",
-      "constraints": {
-        "required": true,
-        "minimum": 0,
-        "maximum": 1000
-      }
-    },
-    {
-      "name": "comment",
-      "title": "Comment",
-      "description": "Optional comment which is not processed further.",
-      "type": "string"
-    }
-  ]
-}
diff --git a/src/memote/experimental/schemata/production.json b/src/memote/experimental/schemata/production.json
deleted file mode 100644
index a8b63b317..000000000
--- a/src/memote/experimental/schemata/production.json
+++ /dev/null
@@ -1,46 +0,0 @@
-{
-  "fields": [
-    {
-      "name": "compound",
-      "title": "Production Compound Identifier",
-      "description": "The product identifier must correspond with the metabolic model identifiers.",
-      "type": "string",
-      "constraints": {
-        "required": true
-      }
-    },
-    {
-      "name": "production",
-      "title": "production Rate",
-      "description": "The measured production rate of the strain.",
-      "type": "number",
-      "constraints": {
-        "required": true,
-        "minimum": 0
-      }
-    },
-    {
-      "name": "growth",
-      "title": "Growth Rate",
-      "description": "The measured growth rate of the production strain.",
-      "type": "number",
-      "constraints": {
-        "required": true,
-        "minimum": 0,
-        "maximum": 10
-      }
-    },
-    {
-      "name": "medium",
-      "title": "Medium Composition",
-      "description": "The filename of the medium composition used if any.",
-      "type": "string"
-    },
-    {
-      "name": "comment",
-      "title": "Comment",
-      "description": "Optional comment which is not processed further.",
-      "type": "string"
-    }
-  ]
-}
diff --git a/src/memote/experimental/tabular.py b/src/memote/experimental/tabular.py
index 985d186f9..04fbdd873 100644
--- a/src/memote/experimental/tabular.py
+++ b/src/memote/experimental/tabular.py
@@ -22,7 +22,12 @@
 import pandas as pd
 
 
-def read_tabular(filename, dtype_conversion=None):
+def read_tabular(
+    filename,
+    dtype_conversion=None,
+    truthy=("yes", "Yes", "YES"),
+    falsy=("no", "No", "NO"),
+):
     """
     Read a tabular data file which can be CSV, TSV, XLS or XLSX.
 
@@ -49,15 +54,45 @@ def read_tabular(filename, dtype_conversion=None):
     # Completely empty columns are interpreted as float by default.
     dtype_conversion["comment"] = str
     if "csv" in ext:
-        df = pd.read_csv(filename, dtype=dtype_conversion, encoding="utf-8")
+        df = pd.read_csv(
+            filename,
+            dtype=dtype_conversion,
+            true_values=list(truthy),
+            false_values=list(falsy),
+            encoding="utf-8",
+        )
     elif "tsv" in ext:
-        df = pd.read_table(filename, sep="\t", dtype=dtype_conversion, encoding="utf-8")
+        df = pd.read_table(
+            filename,
+            sep="\t",
+            dtype=dtype_conversion,
+            true_values=list(truthy),
+            false_values=list(falsy),
+            encoding="utf-8",
+        )
     elif "xlsx" in ext:
-        df = pd.read_excel(filename, dtype=dtype_conversion, engine="openpyxl")
+        df = pd.read_excel(
+            filename,
+            dtype=dtype_conversion,
+            true_values=list(truthy),
+            false_values=list(falsy),
+            engine="openpyxl",
+        )
     elif "xls" in ext:
-        df = pd.read_excel(filename, dtype=dtype_conversion, engine="xlrd")
+        df = pd.read_excel(
+            filename,
+            dtype=dtype_conversion,
+            true_values=list(truthy),
+            false_values=list(falsy),
+            engine="xlrd",
+        )
     elif "ods" in ext:
-        df = pd.read_excel(filename, dtype=dtype_conversion)
+        df = pd.read_excel(
+            filename,
+            dtype=dtype_conversion,
+            true_values=list(truthy),
+            false_values=list(falsy),
+        )
     else:
         raise ValueError("Unknown file format '{}'.".format(ext))
     return df