
Merge pull request #140 from gyorilab/mesh_mapping
Improve MeSH mappings
bgyori authored Jul 18, 2024
2 parents 306e6c7 + 40d95cd commit f6500c4
Showing 8 changed files with 28,252 additions and 822 deletions.
4 changes: 2 additions & 2 deletions benchmarks/fplx_evaluation.py
@@ -18,7 +18,7 @@
'Aminopeptidases': {'MESH': 'D000626'},
'NF-AT proteins': {'MESH': 'D050778'},
'LTbetaR': {'HGNC': '6718'},
-'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016246'},
+'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016441'},
'Chaetocin': {'CHEBI': 'CHEBI:68747'},
'BAY11-7082': {'CHEBI': 'CHEBI:85928'},
'Toll-like receptors': {'MESH': 'D051193'},
@@ -48,7 +48,7 @@
'integrin alpha': {'FPLX': 'ITGA'},
'DC': {'MESH': 'D003713'},
'BMD': {'MESH': 'D015519'},
-'angina': {'EFO': '0003913'}}
+'angina': {'MESH': 'D000787', 'EFO': '0003913'}}


incorrect_assertions = {'IGF': {'HGNC': '5464'},
2 changes: 1 addition & 1 deletion gilda/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.2.1'
+__version__ = '1.3.0'

import logging

14 changes: 7 additions & 7 deletions gilda/generate_terms.py
@@ -565,13 +565,14 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
map_to_ns = {'MESH', 'DOID'}
terms = []
db, db_id, name = prefix.upper(), entry['id'], entry['name']
+entry_name = name
# We first need to decide if we prioritize another name space
xref_dict = {xr['namespace']: xr['id'] for xr in entry.get('xrefs', [])}
# Handle MeSH mappings first
auto_mesh_mapping = mesh_mappings_reverse.get((db, db_id))
if auto_mesh_mapping and not ignore_mappings:
-db, db_id, name = ('MESH', auto_mesh_mapping[0],
-                   auto_mesh_mapping[1])
+db, db_id, entry_name = ('MESH', auto_mesh_mapping[0],
+                         auto_mesh_mapping[1])
elif 'MESH' in map_to_ns and ('MESH' in xref_dict or 'MSH' in xref_dict):
mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
# Since we currently only include regular MeSH terms (which start
@@ -583,7 +584,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# Here we need to check if we further map the MeSH ID to
# another namespace
mesh_mapping = mesh_mappings.get(mesh_id)
-db, db_id, name = mesh_mapping if \
+db, db_id, entry_name = mesh_mapping if \
(mesh_mapping and (mesh_mapping[0]
not in {'EFO', 'HP', 'DOID'})) \
else ('MESH', mesh_id, mesh_name)
@@ -601,15 +602,15 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# If we don't get a name here, it's likely because an entry is
# obsolete so we don't do the mapping
if doid_name:
-db, db_id, name = 'DOID', doid, doid_name
+db, db_id, entry_name = 'DOID', doid, doid_name

# Add a term for the name first
name_term = Term(
norm_text=normalize(name),
text=name,
db=db,
id=db_id,
-entry_name=name,
+entry_name=entry_name,
status='name',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -639,7 +640,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
text=synonym,
db=db,
id=db_id,
-entry_name=name,
+entry_name=entry_name,
status='synonym',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -709,7 +710,6 @@ def get_all_terms():
]
for generated_terms in generated_term_groups:
terms += generated_terms
-
terms = filter_out_duplicates(terms)
return terms

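Note: the net effect of threading entry_name separately from name above is that a mapped term keeps the original ontology's searchable text while db, id, and entry_name point at the mapped MeSH entry. A minimal illustration with hypothetical identifiers (none of these values are from the commit):

from gilda.term import Term
from gilda.process import normalize

# Hypothetical DOID entry mapped to MeSH: text/norm_text keep the
# DOID-side name so it stays matchable, while db/id/entry_name carry
# the MeSH entry it was mapped to (all IDs and names here are invented)
term = Term(
    norm_text=normalize('example disease'),
    text='example disease',
    db='MESH',
    id='D999999',
    entry_name='Example Disease Heading',
    status='name',
    source='doid',
    source_db='DOID',
    source_id='DOID:9999999',
)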
1,584 changes: 1,427 additions & 157 deletions gilda/resources/mesh_ambig_mappings.tsv


27,280 changes: 26,656 additions & 624 deletions gilda/resources/mesh_mappings.tsv

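Note: both resource files use the six-column layout written by render_row in scripts/generate_mesh_mappings.py below: MeSH namespace, ID, and name, followed by the mapped namespace, ID, and name. A minimal parsing sketch, assuming that layout and that the non-ambiguous file has one row per MeSH ID:

import csv

mesh_mappings = {}
with open('gilda/resources/mesh_mappings.tsv') as fh:
    for mesh_ns, mesh_id, mesh_name, db, db_id, db_name in \
            csv.reader(fh, delimiter='\t'):
        # One mapped (namespace, ID, name) per MeSH ID
        mesh_mappings[mesh_id] = (db, db_id, db_name)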

10 changes: 8 additions & 2 deletions gilda/term.py
@@ -154,8 +154,14 @@ def get_identifiers_url(db, id):
return f'https://identifiers.org/{curie}'


-def _term_key(term: Term) -> Tuple[str, str, str]:
-    return term.db, term.id, term.text
+def _term_key(term: Term) -> Tuple[str, str, str, str, str]:
+    # We include source_id and source_db to avoid losing
+    # potentially important links back to mapped source IDs,
+    # but we have to make sure these are strings since otherwise
+    # they could be None, which can't be sorted against strings
+    source_db = term.source_db or ''
+    source_id = term.source_id or ''
+    return term.db, term.id, source_db, source_id, term.text


statuses = {'curated': 1, 'name': 2, 'synonym': 3, 'former_name': 4}
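Note: the widened key feeds duplicate filtering. A minimal sketch, not necessarily Gilda's exact filter_out_duplicates logic, of how the five-element key plus the statuses priority map above can deduplicate a term list:

from itertools import groupby

def dedupe_terms(terms):
    # Sketch only: terms that tie on the five-element key are ranked by
    # status priority (curated < name < synonym < former_name) and only
    # the best-ranked one survives
    ranked = sorted(terms, key=lambda t: (_term_key(t), statuses[t.status]))
    return [next(group) for _, group in groupby(ranked, key=_term_key)]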
4 changes: 2 additions & 2 deletions gilda/tests/test_grounder.py
@@ -56,7 +56,7 @@ def test_grounder_bug():

def test_grounder_num_entries():
entries = gr.lookup('NPM1')
-assert len(entries) == 4, entries
+assert len(entries) == 5, entries
entries = gr.lookup('H4')
assert len(entries) == 7, entries

@@ -202,7 +202,7 @@ def test_unidecode():
for txt in txts:
matches = gr.ground(txt)
assert len(matches) == 2
-assert {m.term.db for m in matches} == {'EFO', 'DOID'}
+assert {m.term.db for m in matches} == {'EFO', 'MESH'}

txts = ['Bi₇O₉I₃', 'Bi7O9I3']
for txt in txts:
176 changes: 149 additions & 27 deletions scripts/generate_mesh_mappings.py
@@ -1,4 +1,5 @@
import os
+import pandas
from collections import defaultdict
from gilda.generate_terms import *
from indra.databases import mesh_client
@@ -16,17 +17,57 @@ def is_chemical(mesh_id):
return mesh_client.is_molecular(mesh_id)


-def render_row(me, te):
-    return '\t'.join([me.db, me.id, me.entry_name,
-                      te.db, te.id, te.entry_name])
+def load_biomappings():
+    """Load curated positive and negative mappings from Biomappings."""
+    url_base = ('https://raw.githubusercontent.com/biopragmatics/biomappings/'
+                'master/src/biomappings/resources/')
+    positive_df = pandas.read_csv(url_base + 'mappings.tsv', sep='\t')
+    negative_df = pandas.read_csv(url_base + 'incorrect.tsv', sep='\t')
+    positive_mappings = defaultdict(list)
+    negative_mappings = defaultdict(list)
+    # These are the only relevant prefixes; there are mappings to
+    # various other namespaces we don't need
+    prefixes = {'fplx', 'chebi', 'go', 'hp', 'doid', 'efo', 'hgnc'}
+    for mapping_df, mappings in ((positive_df, positive_mappings),
+                                 (negative_df, negative_mappings)):
+        for _, row in mapping_df.iterrows():
+            # We only need exact matches.
+            # TODO: should we consider non-exact matches to be effectively
+            # negative?
+            if row['relation'] != 'skos:exactMatch':
+                continue
+            # Look at both directions in which mesh mappings
+            # can appear
+            if row['source prefix'] == 'mesh':
+                mesh_id = row['source identifier']
+                other_ns = row['target prefix']
+                other_id = row['target identifier']
+            elif row['target prefix'] == 'mesh':
+                mesh_id = row['target identifier']
+                other_ns = row['source prefix']
+                other_id = row['source identifier']
+            else:
+                continue
+            if other_ns not in prefixes:
+                continue
+            # We make the namespace upper to be consistent
+            # with Gilda
+            mappings[mesh_id].append((other_ns.upper(), other_id))
+    return positive_mappings, negative_mappings
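Note: a quick usage sketch for the loader above; assumes network access to the Biomappings GitHub repository:

positive, negative = load_biomappings()
# Each result maps a MeSH ID to a list of (NAMESPACE, identifier)
# tuples curated as correct or incorrect, respectively
print('Positive curations cover %d MeSH IDs' % len(positive))
print('Negative curations cover %d MeSH IDs' % len(negative))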


def get_nonambiguous(maps):
-# If there are more than one mappings from MESH
+# If there is more than one mapping from MESH
if len(maps) > 1:
# We see if there are any name-level matches
name_matches = [(me, te) for me, te in maps
-if me.entry_name.lower() == te.entry_name.lower()]
+if (me.entry_name.lower() if me.entry_name else '')
+== (te.entry_name.lower() if te.entry_name else '')
+# Corner case where we have multiple MeSH-based terms
+# due to an original term from e.g., DOID having been
+# mapped to MeSH
+and me.db != te.db]

# If we still have ambiguity, we print to the user
if not name_matches or len(name_matches) > 1:
return None, maps
@@ -42,7 +83,7 @@ def resolve_duplicates(mappings):
keep_mappings = []
all_ambigs = []
# First we deal with mappings from MESH
-for maps in mappings.values():
+for key, maps in mappings.items():
maps_list = maps.values()
keep, ambigs = get_nonambiguous(maps_list)
if keep:
@@ -67,9 +108,17 @@


def dump_mappings(mappings, fname):
+    def render_row(me, te):
+        return '\t'.join([me.db, me.id, me.entry_name,
+                          te.db, te.id, te.entry_name])
+
with open(fname, 'w') as fh:
for mesh_term, other_term in sorted(mappings, key=lambda x: x[0].id):
-fh.write(render_row(mesh_term, other_term) + '\n')
+# Corner case where we have multiple MeSH-based terms
+# due to an original term from e.g., DOID having been
+# mapped to MeSH
+if other_term.db != 'MESH':
+    fh.write(render_row(mesh_term, other_term) + '\n')


def get_ambigs_by_db(ambigs):
@@ -83,9 +132,9 @@ def get_mesh_mappings(ambigs):
mappings_by_mesh_id = defaultdict(dict)
for text, ambig_terms in ambigs.items():
ambigs_by_db = get_ambigs_by_db(ambig_terms)
-print('Considering %s' % text)
-for term in ambig_terms:
-    print('%s:%s %s' % (term.db, term.id, term.entry_name))
+#print('Considering %s' % text)
+#for term in ambig_terms:
+#    print('%s:%s %s' % (term.db, term.id, term.entry_name))
order = [('FPLX', is_protein),
('HGNC', is_protein),
('CHEBI', is_chemical),
@@ -98,11 +147,11 @@
if len(ambigs_by_db.get(ns, [])) == 1 and mesh_constraint(me.id):
mappings_by_mesh_id[me.id][(ambigs_by_db[ns][0].db,
ambigs_by_db[ns][0].id)] = \
-(me, ambigs_by_db[ns][0])
-print('Adding mapping for %s' % ns)
+[me, ambigs_by_db[ns][0]]
+#print('Adding mapping for %s' % ns)
break
-print('--------------')
-return mappings_by_mesh_id
+#print('--------------')
+return dict(mappings_by_mesh_id)


def find_ambiguities(terms, match_attr='text'):
Expand All @@ -112,13 +161,24 @@ def find_ambiguities(terms, match_attr='text'):
# We consider it an ambiguity if the same text entry appears
# multiple times
ambig_entries[match_fun(term)].append(term)
+# There is a corner case where the match_fun matches two different
+# synonyms / variants of the same entry from the same database, which
+# are not really an ambiguity but need to be reduced to a single
+# entry to avoid being inadvertently filtered out later
+ambig_entries = {
+    # Here, we make sure we only keep a single term with a given db and id
+    norm_term: list({(term.db, term.id): term for term in matching_terms}.values())
+    for norm_term, matching_terms in ambig_entries.items()
+}
# It's only an ambiguity if there are two entries at least
-ambig_entries = {k: v for k, v in ambig_entries.items() if len(v) >= 2}
+ambig_entries = {norm_term: matching_terms
+                 for norm_term, matching_terms
+                 in ambig_entries.items() if len(matching_terms) >= 2}
# We filter out any ambiguities that contain not exactly one MeSH term
ambig_entries = {k: v for k, v in ambig_entries.items()
if len([e for e in v if e.db == 'MESH']) == 1}
print('Found a total of %d relevant ambiguities' % len(ambig_entries))
-return ambig_entries
+return dict(ambig_entries)
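Note: a hedged illustration of the corner-case reduction above, using aspirin's real MeSH and ChEBI IDs but invented Term variants, and assuming match_attr can name any Term attribute such as norm_text:

from gilda.term import Term

# Two synonym variants of the same CHEBI entry plus one MeSH term,
# all normalizing to the same text (hypothetical terms)
mesh = Term('aspirin', 'aspirin', 'MESH', 'D001241', 'Aspirin',
            'name', 'mesh')
chebi_a = Term('aspirin', 'aspirin', 'CHEBI', 'CHEBI:15365',
               'acetylsalicylic acid', 'synonym', 'chebi')
chebi_b = Term('aspirin', 'Aspirin', 'CHEBI', 'CHEBI:15365',
               'acetylsalicylic acid', 'synonym', 'chebi')

ambigs = find_ambiguities([mesh, chebi_a, chebi_b], match_attr='norm_text')
# The two CHEBI variants collapse to a single term for CHEBI:15365, so
# the downstream one-term-per-namespace check in get_mesh_mappings still
# sees a clean one-MeSH-vs-one-CHEBI ambiguity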


def get_terms():
Expand All @@ -135,10 +195,7 @@ def get_terms():
return terms


-def manual_go_mappings(terms):
-    td = defaultdict(list)
-    for term in terms:
-        td[(term.db, term.id)].append(term)
+def manual_go_mappings(terms_by_id_tuple):
# Migrated from FamPlex and INDRA
map = [
('D002465', 'GO:0048870'),
Expand All @@ -153,14 +210,29 @@ def manual_go_mappings(terms):
]
mappings_by_mesh_id = defaultdict(dict)
for mid, gid in map:
-mt = td[('MESH', mid)][0]
-gt = td[('GO', gid)][0]
+mt = terms_by_id_tuple[('MESH', mid)]
+gt = terms_by_id_tuple[('GO', gid)]
mappings_by_mesh_id[mid][('GO', gid)] = (mt, gt)
-return mappings_by_mesh_id
+return dict(mappings_by_mesh_id)


if __name__ == '__main__':
terms = get_terms()
+# We create a lookup of term objects by their db/id tuple
+# for quick lookups. We also add source db/ids here
+# because they can be relevant when finding terms for
+# Biomappings curations. Note that when loading e.g.,
+# DOID terms, the native xrefs from DOID to MESH
+# are applied, even if terms are loaded with the ignore_mappings
+# option, which just turns off loading the mappings that are
+# generated in this script.
+known_mappings = set()
+terms_by_id_tuple = {}
+for term in terms:
+    terms_by_id_tuple[(term.db, term.id)] = term
+    if term.source_id:
+        terms_by_id_tuple[(term.source_db, term.source_id)] = term
+        known_mappings.add((term.db, term.id, term.source_db, term.source_id))
# General ambiguities
ambigs = find_ambiguities(terms, match_attr='text')
mappings = get_mesh_mappings(ambigs)
@@ -171,11 +243,61 @@ def manual_go_mappings(terms):
for k, v in mappings2.items():
if k not in mappings:
mappings[k] = v
-mappings3 = manual_go_mappings(terms)
+# Mappings from GO terms
+mappings3 = manual_go_mappings(terms_by_id_tuple)
for k, v in mappings3.items():
if k not in mappings:
mappings[k] = v
-mappings, mapping_ambigs = resolve_duplicates(mappings)
-dump_mappings(mappings, os.path.join(resources, 'mesh_mappings.tsv'))
-dump_mappings(mapping_ambigs,

+# We now have to account for Biomappings curations
+positive_biomappings, negative_biomappings = load_biomappings()
+keys_to_remove = set()
+# Iterate over all the automatically proposed mappings
+for mesh_id, local_mappings in mappings.items():
+    # If we already have a positive curation for the given MeSH ID
+    # we want to replace the content automatically generated here
+    # with the terms corresponding to the positive curation
+    if mesh_id in positive_biomappings:
+        other_ids = positive_biomappings[mesh_id]
+        new_mappings = {}
+        for other_id in other_ids:
+            # If the other ID already exists, we just copy it over
+            if other_id in mappings[mesh_id]:
+                new_mappings[other_id] = mappings[mesh_id][other_id]
+            # If it doesn't exist yet, we look up a Term for it
+            # and add it to the mappings
+            else:
+                if other_id in terms_by_id_tuple:
+                    mesh_term = terms_by_id_tuple[('MESH', mesh_id)]
+                    other_term = terms_by_id_tuple[other_id]
+                    new_mappings[other_id] = [mesh_term, other_term]
+                # This is a corner case where something is in Biomappings
+                # but not in the set of Gilda terms. This can happen
+                # if a term has been deprecated/replaced in an ontology.
+                # We ignore these mappings and just keep what we have.
+                else:
+                    print('%s missing from set of terms' % str(other_id))
+                    new_mappings = mappings[mesh_id]
+        mappings[mesh_id] = new_mappings
+    # If we have a negative curation for this MeSH ID, we make sure
+    # that we remove any known incorrect mappings
+    if mesh_id in negative_biomappings:
+        other_ids = negative_biomappings[mesh_id]
+        if mesh_id in mappings:
+            for other_id in other_ids:
+                if other_id in mappings[mesh_id]:
+                    mappings[mesh_id].pop(other_id, None)
+            # If nothing left, we remove the whole MeSH ID key
+            if not mappings[mesh_id]:
+                keys_to_remove.add(mesh_id)
+for key in keys_to_remove:
+    mappings.pop(key)
+nonambig_mappings, ambig_mappings = resolve_duplicates(mappings)
+dump_mappings(nonambig_mappings, os.path.join(resources, 'mesh_mappings.tsv'))
+dump_mappings(ambig_mappings,
os.path.join(resources, 'mesh_ambig_mappings.tsv'))

+# Known mappings are useful for debugging
+#with open(os.path.join(resources, 'known_mappings.tsv'), 'w') as fh:
+#    for db, id, source_db, source_id in sorted(known_mappings):
+#        fh.write('\t'.join([db, id, source_db, source_id]) + '\n')
