From f52ba0257cad2a139e10eac73497a2eb9b5ae7bc Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 11 Mar 2024 18:46:23 +0100 Subject: [PATCH] Improve docs about data model for predictions (#163) Closes #119 --- docs/source/usage.rst | 2 + scripts/README.md | 11 +++++ src/biomappings/resources/__init__.py | 61 +++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 scripts/README.md diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 2f4235fb..fd147244 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -5,3 +5,5 @@ Reference .. automodapi:: biomappings.utils .. automodapi:: biomappings.testing + +.. automodapi:: biomappings.resources diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..ffbd1a36 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,11 @@ +# Scripts + +This folder houses scripts that can be used to generate predicted mappings, typically +through a lexical mapping workflow. + +Most of the lexical mappings in Biomappings were generated with a workflow that wraps Gilda and PyOBO. +However, Biomappings is generic to any workflow that generates predictions, such as those +coming from knowledge graph embedding models. More information about the helper functions +for writing your own prediction generation workflow can be found +at https://biomappings.readthedocs.io/en/latest/usage.html. This also has a summary of the data types that +correspond to rows in the mappings (`MappingTuple`) and predictions files (`PredictionTuple`). 
diff --git a/src/biomappings/resources/__init__.py b/src/biomappings/resources/__init__.py index 685ae124..179bf00e 100644 --- a/src/biomappings/resources/__init__.py +++ b/src/biomappings/resources/__init__.py @@ -27,6 +27,34 @@ from biomappings.utils import OVERRIDE_MIRIAM, RESOURCE_PATH, get_canonical_tuple +__all__ = [ + "MappingTuple", + "MAPPINGS_HEADER", + "PredictionTuple", + "PREDICTIONS_HEADER", + "Mappings", + "load_mappings", + "load_mappings_subset", + "append_true_mappings", + "append_true_mapping_tuples", + "write_true_mappings", + "load_false_mappings", + "append_false_mappings", + "write_false_mappings", + "load_unsure", + "append_unsure_mappings", + "write_unsure_mappings", + "load_predictions", + "append_predictions", + "append_prediction_tuples", + "write_predictions", + "remove_mappings", + "load_curators", + "filter_predictions", + "get_curated_filter", + "prediction_tuples_from_semra", +] + logger = logging.getLogger(__name__) MAPPINGS_HEADER = [ @@ -110,8 +138,41 @@ class PredictionTuple(NamedTuple): target_identifier: str target_name: str type: str + """A `semapv `_ term describing the mapping type. + + These are relatively high level, and can be any child of ``semapv:Matching``, including: + + 1. ``semapv:LexicalMatching`` + 2. ``semapv:LogicalReasoning`` + """ confidence: float + """An assessment of the confidence of the mapping, reported by the method used to generate it. + + This means that confidence values aren't generally comparable, though they should follow + the rough standard that closer to 1 is more confident and closer to 0 is less confident. + + Most of the lexical mappings already in Biomappings were generated with Gilda. + Depending on the script, the score therefore refers to either: + + 1. The Gilda match score, inspired by https://aclanthology.org/W15-3801/. Section 5.2 of the + `supplementary material for the Gilda paper `_ + describes this score in detail, where 1.0 is best and 0 is worst. 
+ https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_agrovoc_mappings.py + is an example that uses this variant. + 2. A high-level estimation of the precision of the scores generated by the given script. + For example, the CL-MeSH mappings were estimated to be 90% correct, so all the mappings + generated by https://github.com/biopragmatics/biomappings/blob/master/scripts/generate_cl_mesh_mappings.py + are marked with 0.9 as their score. + + However, other variants are possible. For example, this confidence could reflect the loss function + if a knowledge graph embedding model was used to generate a mapping prediction. + """ source: str + """The script or process that generated this mapping. + + Most of these scripts are in https://github.com/biopragmatics/biomappings/tree/master/scripts, + or can be based off of them. + """ def as_dict(self) -> Mapping[str, Any]: """Get the prediction tuple as a dictionary."""