Make tests reusable
This is helpful if you curate your own mappings elsewhere and still want a test case that checks they are internally consistent.
cthoyt committed Aug 14, 2023
1 parent f74c651 commit 483feba
Showing 2 changed files with 197 additions and 185 deletions.
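As an illustration of the reuse described in the commit message, a downstream repository could subclass the new biomappings.testing.IntegrityTestCase and point its four attributes at locally curated files, roughly as in the illustrative sketch below. The file names and the _load_tsv helper are hypothetical, and the sketch assumes the external TSVs use the same column headers as the Biomappings resources (source prefix, source identifier, target prefix, target identifier, type, source).

# Hypothetical downstream test module (e.g., tests/test_my_mappings.py) that
# reuses the shared integrity checks introduced in this commit.
import csv
from pathlib import Path

from biomappings.testing import IntegrityTestCase

HERE = Path(__file__).parent.resolve()


def _load_tsv(path: Path):
    """Load a mappings TSV file into a list of dictionaries keyed by column header."""
    with path.open() as file:
        return list(csv.DictReader(file, delimiter="\t"))


class TestCustomMappings(IntegrityTestCase):
    """Run the shared integrity tests over externally curated mappings."""

    def setUp(self) -> None:
        """Point the test case at the externally curated mapping files (paths are hypothetical)."""
        self.mappings = _load_tsv(HERE / "positive.tsv")
        self.incorrect = _load_tsv(HERE / "negative.tsv")
        self.unsure = _load_tsv(HERE / "unsure.tsv")
        self.predictions = []  # this hypothetical repository curates no predictions

Running this with unittest or pytest would then exercise the same prefix, identifier, and redundancy checks against the external curations without copying any test code.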
189 changes: 189 additions & 0 deletions src/biomappings/testing.py
@@ -0,0 +1,189 @@
# -*- coding: utf-8 -*-

"""Validation tests for :mod:`biomappings`."""

import itertools as itt
import unittest
from collections import defaultdict

import bioregistry

from biomappings.resources import (
    Mappings,
    MappingTuple,
    PredictionTuple,
    load_curators,
    mapping_sort_key,
)
from biomappings.resources.semapv import get_semapv
from biomappings.utils import (
    InvalidIdentifierPattern,
    InvalidNormIdentifier,
    check_valid_prefix_id,
    get_canonical_tuple,
)

semapv = get_semapv()


def _extract_redundant(counter):
    return [(key, values) for key, values in counter.items() if len(values) > 1]


def _locations_str(locations):
    return ", ".join(f"{label}:{line}" for label, line in locations)


class IntegrityTestCase(unittest.TestCase):
    """Data integrity tests."""

    mappings: Mappings
    predictions: Mappings
    incorrect: Mappings
    unsure: Mappings

    def _iter_groups(self):
        for group, label in [
            (self.mappings, "positive"),
            (self.incorrect, "negative"),
            (self.predictions, "predictions"),
            (self.unsure, "unsure"),
        ]:
            for i, mapping in enumerate(group, start=2):
                yield label, i, mapping

    def test_prediction_types(self):
        """Test that the prediction type is pulled in properly."""
        for line, mapping in enumerate(self.mappings, start=2):
            pt = mapping.get("prediction_type", "").strip()
            if not pt:
                continue
            self.assertTrue(
                pt.startswith("semapv:"),
                msg=f"Prediction type should be annotated with semapv on line {line}",
            )
            self.assertIn(pt[len("semapv:") :], semapv)
            self.assertNotEqual(
                "semapv:ManualMappingCuration",
                pt,
                msg="Predictions cannot be annotated with manual curation",
            )

        for label, line, mapping in self._iter_groups():
            tt = mapping["type"]
            self.assertTrue(
                tt.startswith("semapv:"),
                msg=f"[{label}] The 'type' column should be annotated with semapv on line {line}",
            )
            self.assertIn(tt[len("semapv:") :], semapv)

    def test_canonical_prefixes(self):
        """Test that all mappings use canonical bioregistry prefixes."""
        valid_prefixes = set(bioregistry.read_registry())
        for label, line, mapping in self._iter_groups():
            source_prefix, target_prefix = mapping["source prefix"], mapping["target prefix"]
            self.assertIn(
                source_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {source_prefix} on {label}:{line}",
            )
            self.assertIn(
                target_prefix,
                valid_prefixes,
                msg=f"Invalid prefix: {target_prefix} on {label}:{line}",
            )

    def test_normalized_identifiers(self):
        """Test that all identifiers have been normalized (based on bioregistry definition)."""
        for label, line, mapping in self._iter_groups():
            self.assert_canonical_identifier(
                mapping["source prefix"], mapping["source identifier"], label, line
            )
            self.assert_canonical_identifier(
                mapping["target prefix"], mapping["target identifier"], label, line
            )

    def assert_canonical_identifier(
        self, prefix: str, identifier: str, label: str, line: int
    ) -> None:
        """Assert a given identifier is canonical.

        :param prefix: The prefix to check
        :param identifier: The identifier in the semantic space for the prefix
        :param label: The label of the mapping file
        :param line: The line number of the mapping
        """
        try:
            check_valid_prefix_id(prefix, identifier)
        except InvalidNormIdentifier as e:
            self.fail(f"[{label}:{line}] {e}")
        except InvalidIdentifierPattern as e:
            self.fail(f"[{label}:{line}] {e}")

    def test_contributors(self):
        """Test all contributors have an entry in the curators.tsv file."""
        contributor_orcids = {row["orcid"] for row in load_curators()}
        for mapping in itt.chain(self.mappings, self.incorrect, self.unsure):
            source = mapping["source"]
            if not source.startswith("orcid:"):
                continue
            self.assertIn(source[len("orcid:") :], contributor_orcids)

    def test_cross_redundancy(self):
        """Test the redundancy of manually curated mappings and predicted mappings."""
        counter = defaultdict(lambda: defaultdict(list))
        for label, line, mapping in self._iter_groups():
            counter[get_canonical_tuple(mapping)][label].append(line)

        redundant = []
        for mapping, label_to_lines in counter.items():
            if len(label_to_lines) <= 1:
                continue
            redundant.append((mapping, sorted(label_to_lines.items())))

        if redundant:
            msg = "".join(
                f"\n {mapping}: {_locations_str(locations)}" for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} mappings are redundant: {msg}")

    def assert_no_internal_redundancies(self, m: Mappings, tuple_cls):
        """Assert that the list of mappings doesn't have any redundancies."""
        counter = defaultdict(list)
        for line, mapping in enumerate(m, start=1):
            counter[tuple_cls.from_dict(mapping)].append(line)
        redundant = _extract_redundant(counter)
        if redundant:
            msg = "".join(
                f"\n {mapping.source_curie}/{mapping.target_curie}: {locations}"
                for mapping, locations in redundant
            )
            raise ValueError(f"{len(redundant)} mappings are redundant: {msg}")

    def test_predictions_sorted(self):
        """Test the predictions are in a canonical order."""
        assert self.predictions == sorted(  # noqa:S101
            self.predictions, key=mapping_sort_key
        ), "Predictions are not sorted"
        self.assert_no_internal_redundancies(self.predictions, PredictionTuple)

    def test_curations_sorted(self):
        """Test the true curated mappings are in a canonical order."""
        assert self.mappings == sorted(  # noqa:S101
            self.mappings, key=mapping_sort_key
        ), "True curations are not sorted"
        self.assert_no_internal_redundancies(self.mappings, MappingTuple)

    def test_false_mappings_sorted(self):
        """Test the false curated mappings are in a canonical order."""
        assert self.incorrect == sorted(  # noqa:S101
            self.incorrect, key=mapping_sort_key
        ), "False curations are not sorted"
        self.assert_no_internal_redundancies(self.incorrect, MappingTuple)

    def test_unsure_sorted(self):
        """Test the unsure mappings are in a canonical order."""
        assert self.unsure == sorted(  # noqa:S101
            self.unsure, key=mapping_sort_key
        ), "Unsure curations are not sorted"
        self.assert_no_internal_redundancies(self.unsure, MappingTuple)
193 changes: 8 additions & 185 deletions tests/test_validity.py
@@ -2,198 +2,21 @@

"""Validation tests for :mod:`biomappings`."""

import itertools as itt
import unittest
from collections import defaultdict

import bioregistry

from biomappings import (
load_false_mappings,
load_mappings,
load_predictions,
load_unsure,
)
from biomappings.resources import (
MappingTuple,
PredictionTuple,
load_curators,
mapping_sort_key,
)
from biomappings.resources.semapv import get_semapv
from biomappings.utils import (
InvalidIdentifierPattern,
InvalidNormIdentifier,
check_valid_prefix_id,
get_canonical_tuple,
testing,
)

mappings = load_mappings()
predictions = load_predictions()
incorrect = load_false_mappings()
unsure = load_unsure()
semapv = get_semapv()


def _iter_groups():
for group, label in [
(mappings, "positive"),
(incorrect, "negative"),
(predictions, "predictions"),
(unsure, "unsure"),
]:
for i, mapping in enumerate(group, start=2):
yield label, i, mapping


class TestIntegrity(unittest.TestCase):
class TestIntegrity(testing.IntegrityTestCase):
"""Data integrity tests."""

def test_prediction_types(self):
"""Test that the prediction type is pulled in properly."""
for line, mapping in enumerate(mappings, start=2):
pt = mapping.get("prediction_type", "".strip())
if not pt:
continue
self.assertTrue(
pt.startswith("semapv:"),
msg=f"Prediction type should be annotated with semapv on line {line}",
)
self.assertIn(pt[len("semapv:") :], semapv)
self.assertNotEqual(
"semapv:ManualMappingCuration",
pt,
msg="Prediction can not be annotated with manual curation",
)

for label, line, mapping in _iter_groups():
tt = mapping["type"]
self.assertTrue(
tt.startswith("semapv:"),
msg=f"[{label}] The 'type' column should be annotated with semapv on line {line}",
)
self.assertIn(tt[len("semapv:") :], semapv)

def test_canonical_prefixes(self):
"""Test that all mappings use canonical bioregistry prefixes."""
valid_prefixes = set(bioregistry.read_registry())
for label, line, mapping in _iter_groups():
source_prefix, target_prefix = mapping["source prefix"], mapping["target prefix"]
self.assertIn(
source_prefix,
valid_prefixes,
msg=f"Invalid prefix: {source_prefix} on {label}:{line}",
)
self.assertIn(
target_prefix,
valid_prefixes,
msg=f"Invalid prefix: {target_prefix} on {label}:{line}",
)

def test_normalized_identifiers(self):
"""Test that all identifiers have been normalized (based on bioregistry definition)."""
for label, line, mapping in _iter_groups():
self.assert_canonical_identifier(
mapping["source prefix"], mapping["source identifier"], label, line
)
self.assert_canonical_identifier(
mapping["target prefix"], mapping["target identifier"], label, line
)

def assert_canonical_identifier(
self, prefix: str, identifier: str, label: str, line: int
) -> None:
"""Assert a given identifier is canonical.
:param prefix: The prefix to check
:param identifier: The identifier in the semantic space for the prefix
:param label: The label of the mapping file
:param line: The line number of the mapping
"""
try:
check_valid_prefix_id(prefix, identifier)
except InvalidNormIdentifier as e:
self.fail(f"[{label}:{line}] {e}")
except InvalidIdentifierPattern as e:
self.fail(f"[{label}:{line}] {e}")

def test_contributors(self):
"""Test all contributors have an entry in the curators.tsv file."""
contributor_orcids = {row["orcid"] for row in load_curators()}
for mapping in itt.chain(mappings, incorrect, unsure):
source = mapping["source"]
if not source.startswith("orcid:"):
continue
self.assertIn(source[len("orcid:") :], contributor_orcids)


def _extract_redundant(counter):
return [(key, values) for key, values in counter.items() if len(values) > 1]


def test_cross_redundancy():
"""Test the redundancy of manually curated mappings and predicted mappings."""
counter = defaultdict(lambda: defaultdict(list))
for label, line, mapping in _iter_groups():
counter[get_canonical_tuple(mapping)][label].append(line)

redundant = []
for mapping, label_to_lines in counter.items():
if len(label_to_lines) <= 1:
continue
redundant.append((mapping, sorted(label_to_lines.items())))

if redundant:
msg = "".join(
f"\n {mapping}: {_locations_str(locations)}" for mapping, locations in redundant
)
raise ValueError(f"{len(redundant)} are redundant: {msg}")


def _locations_str(locations):
return ", ".join(f"{label}:{line}" for label, line in locations)


def _assert_no_internal_redundancies(m, tuple_cls):
counter = defaultdict(list)
for line, mapping in enumerate(m, start=1):
counter[tuple_cls.from_dict(mapping)].append(line)
redundant = _extract_redundant(counter)
if redundant:
msg = "".join(
f"\n {mapping.source_curie}/{mapping.target_curie}: {locations}"
for mapping, locations in redundant
)
raise ValueError(f"{len(redundant)} are redundant: {msg}")


def test_predictions_sorted():
"""Test the predictions are in a canonical order."""
assert predictions == sorted( # noqa:S101
predictions, key=mapping_sort_key
), "Predictions are not sorted"
_assert_no_internal_redundancies(predictions, PredictionTuple)


def test_curations_sorted():
"""Test the true curated mappings are in a canonical order."""
assert mappings == sorted( # noqa:S101
mappings, key=mapping_sort_key
), "True curations are not sorted"
_assert_no_internal_redundancies(mappings, MappingTuple)


def test_false_mappings_sorted():
"""Test the false curated mappings are in a canonical order."""
assert incorrect == sorted( # noqa:S101
incorrect, key=mapping_sort_key
), "False curations are not sorted"
_assert_no_internal_redundancies(incorrect, MappingTuple)


def test_unsure_sorted():
"""Test the unsure mappings are in a canonical order."""
assert unsure == sorted( # noqa:S101
unsure, key=mapping_sort_key
), "Unsure curations are not sorted"
_assert_no_internal_redundancies(unsure, MappingTuple)
def setUp(self) -> None:
"""Set up the test case."""
self.mappings = load_mappings()
self.predictions = load_predictions()
self.incorrect = load_false_mappings()
self.unsure = load_unsure()
