Skip to content

Commit

Permalink
Improve docs about data model for predictions (#163)
Browse files Browse the repository at this point in the history
Closes #119
  • Loading branch information
cthoyt authored Mar 11, 2024
1 parent 008cba0 commit f52ba02
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ Reference
.. automodapi:: biomappings.utils

.. automodapi:: biomappings.testing

.. automodapi:: biomappings.resources
11 changes: 11 additions & 0 deletions scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Scripts

This folder houses scripts that can be used to generate predicted mappings, typically
through a lexical mapping workflow.

Most of the lexical mappings in Biomappings were generated with a workflow that wraps Gilda and PyOBO.
However, Biomappings is generic to any workflow that generates predictions, such as those
coming from knowledge graph embedding models. More information about the helper functions
for writing your own prediction generation workflow can be found
at https://biomappings.readthedocs.io/en/latest/usage.html. This also has a summary of the data types that
correspond to rows in the mappings (`MappingTuple`) and predictions files (`PredictionTuple`).
61 changes: 61 additions & 0 deletions src/biomappings/resources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,34 @@

from biomappings.utils import OVERRIDE_MIRIAM, RESOURCE_PATH, get_canonical_tuple

# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
"MappingTuple",
"MAPPINGS_HEADER",
"PredictionTuple",
"PREDICTIONS_HEADER",
"Mappings",
"load_mappings",
"load_mappings_subset",
"append_true_mappings",
"append_true_mapping_tuples",
"write_true_mappings",
"load_false_mappings",
"append_false_mappings",
"write_false_mappings",
"load_unsure",
"append_unsure_mappings",
"write_unsure_mappings",
"load_predictions",
"append_predictions",
"append_prediction_tuples",
"write_predictions",
"remove_mappings",
"load_curators",
"filter_predictions",
"get_curated_filter",
"prediction_tuples_from_semra",
]

logger = logging.getLogger(__name__)

MAPPINGS_HEADER = [
Expand Down Expand Up @@ -110,8 +138,41 @@ class PredictionTuple(NamedTuple):
target_identifier: str
target_name: str
type: str
"""A `semapv <https://bioregistry.io/registry/semapv>`_ term describing the mapping type.
These are relatively high level, and can be any child of ``semapv:Matching``, including:
1. ``semapv:LexicalMatching``
2. ``semapv:LogicalReasoning``
"""
confidence: float
"""An assessment of the confidence of the mapping, reported by the method used to generate it.
This means that confidence values aren't generally comparable, though they should follow
the rough standard that closer to 1 is more confident and closer to 0 is less confident.
Most of the lexical mappings already in Biomappings were generated with Gilda.
Depending on the script, the score therefore refers to either:
1. The Gilda match score, inspired by https://aclanthology.org/W15-3801/. Section 5.2 of the
`supplementary material for the Gilda paper <https://doi.org/10.1093/bioadv/vbac034>`_
describes this score in detail, where 1.0 is best and 0 is worst.
https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_agrovoc_mappings.py
is an example that uses this variant.
2. A high-level estimation of the precision of the scores generated by the given script.
For example, the CL-MeSH mappings were estimated to be 90% correct, so all the mappings
generated by https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_cl_mesh_mappings.py
are marked with 0.9 as their score.
However, other variants are possible. For example, this confidence could reflect the loss function
if a knowledge graph embedding model was used to generate a mapping prediction.
"""
source: str
"""The script or process that generated this mapping.
Most of these scripts are in https://github.com/biopragmatics/biomappings/tree/master/scripts,
or can be based off of them.
"""

def as_dict(self) -> Mapping[str, Any]:
"""Get the prediction tuple as a dictionary."""
Expand Down

0 comments on commit f52ba02

Please sign in to comment.