From f52ba0257cad2a139e10eac73497a2eb9b5ae7bc Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Mon, 11 Mar 2024 18:46:23 +0100
Subject: [PATCH] Improve docs about data model for predictions (#163)

Closes #119
---
 docs/source/usage.rst                 |  2 +
 scripts/README.md                     | 11 +++++
 src/biomappings/resources/__init__.py | 61 +++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)
 create mode 100644 scripts/README.md

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 2f4235fb..fd147244 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -5,3 +5,5 @@ Reference
 .. automodapi:: biomappings.utils
 
 .. automodapi:: biomappings.testing
+
+.. automodapi:: biomappings.resources
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 00000000..ffbd1a36
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,11 @@
+# Scripts
+
+This folder houses scripts that can be used to generate predicted mappings, typically
+through a lexical mapping workflow.
+
+Most of the lexical mappings in Biomappings were generated with a workflow that wraps Gilda and PyOBO.
+However, Biomappings is generic to any workflow that generates predictions, such as those
+coming from knowledge graph embedding models. More information about the helper functions
+for writing your own prediction generation workflow can be found
+at https://biomappings.readthedocs.io/en/latest/usage.html. This also has a summary of the data types that
+correspond to rows in the mappings (`MappingTuple`) and predictions files (`PredictionTuple`).
diff --git a/src/biomappings/resources/__init__.py b/src/biomappings/resources/__init__.py
index 685ae124..179bf00e 100644
--- a/src/biomappings/resources/__init__.py
+++ b/src/biomappings/resources/__init__.py
@@ -27,6 +27,34 @@
 
 from biomappings.utils import OVERRIDE_MIRIAM, RESOURCE_PATH, get_canonical_tuple
 
+__all__ = [
+    "MappingTuple",
+    "MAPPINGS_HEADER",
+    "PredictionTuple",
+    "PREDICTIONS_HEADER",
+    "Mappings",
+    "load_mappings",
+    "load_mappings_subset",
+    "append_true_mappings",
+    "append_true_mapping_tuples",
+    "write_true_mappings",
+    "load_false_mappings",
+    "append_false_mappings",
+    "write_false_mappings",
+    "load_unsure",
+    "append_unsure_mappings",
+    "write_unsure_mappings",
+    "load_predictions",
+    "append_predictions",
+    "append_prediction_tuples",
+    "write_predictions",
+    "remove_mappings",
+    "load_curators",
+    "filter_predictions",
+    "get_curated_filter",
+    "prediction_tuples_from_semra",
+]
+
 logger = logging.getLogger(__name__)
 
 MAPPINGS_HEADER = [
@@ -110,8 +138,41 @@ class PredictionTuple(NamedTuple):
     target_identifier: str
     target_name: str
     type: str
+    """A `semapv <https://bioregistry.io/registry/semapv>`_ term describing the mapping type.
+
+    These are relatively high level, and can be any child of ``semapv:Matching``, including:
+
+    1. ``semapv:LexicalMatching``
+    2. ``semapv:LogicalReasoning``
+    """
     confidence: float
+    """An assessment of the confidence of the mapping, reported by the method used to generate it.
+
+    This means that confidence values aren't generally comparable, though they should follow
+    the rough standard that closer to 1 is more confident and closer to 0 is less confident.
+
+    Most of the lexical mappings already in Biomappings were generated with Gilda.
+    Depending on the script, the score therefore refers to either:
+
+    1. The Gilda match score, inspired by https://aclanthology.org/W15-3801/. Section 5.2 of the
+       `supplementary material for the Gilda paper <https://doi.org/10.1093/bioadv/vbac034>`_
+       describes this score in detail, where 1.0 is best and 0 is worst.
+       https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_agrovoc_mappings.py
+       is an example that uses this variant.
+    2. A high-level estimation of the precision of the mappings generated by the given script.
+       For example, the CL-MeSH mappings were estimated to be 90% correct, so all the mappings
+       generated by https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_cl_mesh_mappings.py
+       are marked with 0.9 as their score.
+
+    However, other variants are possible. For example, this confidence could reflect the loss function
+    if a knowledge graph embedding model was used to generate a mapping prediction.
+    """
     source: str
+    """The script or process that generated this mapping.
+
+    Most of these scripts are in https://github.com/biopragmatics/biomappings/tree/master/scripts,
+    or can be based off of them.
+    """
 
     def as_dict(self) -> Mapping[str, Any]:
         """Get the prediction tuple as a dictionary."""