
Improve MeSH mappings #140

Merged: 11 commits, Jul 18, 2024
4 changes: 2 additions & 2 deletions benchmarks/fplx_evaluation.py
@@ -18,7 +18,7 @@
'Aminopeptidases': {'MESH': 'D000626'},
'NF-AT proteins': {'MESH': 'D050778'},
'LTbetaR': {'HGNC': '6718'},
'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016246'},
'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016441'},
'Chaetocin': {'CHEBI': 'CHEBI:68747'},
'BAY11-7082': {'CHEBI': 'CHEBI:85928'},
'Toll-like receptors': {'MESH': 'D051193'},
@@ -48,7 +48,7 @@
'integrin alpha': {'FPLX': 'ITGA'},
'DC': {'MESH': 'D003713'},
'BMD': {'MESH': 'D015519'},
'angina': {'EFO': '0003913'}}
'angina': {'MESH': 'D000787', 'EFO': '0003913'}}


incorrect_assertions = {'IGF': {'HGNC': '5464'},
2 changes: 1 addition & 1 deletion gilda/__init__.py
@@ -1,4 +1,4 @@
__version__ = '1.2.1'
__version__ = '1.3.0'

import logging

14 changes: 7 additions & 7 deletions gilda/generate_terms.py
@@ -565,13 +565,14 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
map_to_ns = {'MESH', 'DOID'}
terms = []
db, db_id, name = prefix.upper(), entry['id'], entry['name']
entry_name = name
# We first need to decide if we prioritize another namespace
xref_dict = {xr['namespace']: xr['id'] for xr in entry.get('xrefs', [])}
# Handle MeSH mappings first
auto_mesh_mapping = mesh_mappings_reverse.get((db, db_id))
if auto_mesh_mapping and not ignore_mappings:
db, db_id, name = ('MESH', auto_mesh_mapping[0],
auto_mesh_mapping[1])
db, db_id, entry_name = ('MESH', auto_mesh_mapping[0],
auto_mesh_mapping[1])
elif 'MESH' in map_to_ns and ('MESH' in xref_dict or 'MSH' in xref_dict):
mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
# Since we currently only include regular MeSH terms (which start
@@ -583,7 +584,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# Here we need to check if we further map the MeSH ID to
# another namespace
mesh_mapping = mesh_mappings.get(mesh_id)
db, db_id, name = mesh_mapping if \
db, db_id, entry_name = mesh_mapping if \
(mesh_mapping and (mesh_mapping[0]
not in {'EFO', 'HP', 'DOID'})) \
else ('MESH', mesh_id, mesh_name)
@@ -601,15 +602,15 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# If we don't get a name here, it's likely because an entry is
# obsolete so we don't do the mapping
if doid_name:
db, db_id, name = 'DOID', doid, doid_name
db, db_id, entry_name = 'DOID', doid, doid_name

# Add a term for the name first
name_term = Term(
norm_text=normalize(name),
text=name,
db=db,
id=db_id,
entry_name=name,
entry_name=entry_name,
status='name',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -639,7 +640,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
text=synonym,
db=db,
id=db_id,
entry_name=name,
entry_name=entry_name,
status='synonym',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -709,7 +710,6 @@ def get_all_terms():
]
for generated_terms in generated_term_groups:
terms += generated_terms

terms = filter_out_duplicates(terms)
return terms

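Note on the refactor above: `name` keeps driving the lookup text of the generated term, while the new `entry_name` variable carries the canonical name of the mapped target, so an OBO entry remapped to MeSH retains its original surface text but reports the MeSH name. A minimal sketch of the resulting behavior, using a simplified stand-in for Gilda's Term and hypothetical example values:

from dataclasses import dataclass

@dataclass
class Term:  # simplified stand-in for gilda.term.Term
    norm_text: str
    text: str
    db: str
    id: str
    entry_name: str
    status: str

# Hypothetical DOID entry whose xref remaps it to MeSH
name = 'malaria'  # original OBO name; still used as the lookup text
db, db_id, entry_name = 'MESH', 'D008288', 'Malaria'  # mapped target

name_term = Term(
    norm_text=name.lower(),   # stand-in for gilda's normalize()
    text=name,                # surface text stays the original name
    db=db,
    id=db_id,
    entry_name=entry_name,    # canonical name now follows the mapping
    status='name',
)
assert name_term.text == 'malaria'
assert name_term.entry_name == 'Malaria'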
1,584 changes: 1,427 additions & 157 deletions gilda/resources/mesh_ambig_mappings.tsv

Large diffs are not rendered by default.

27,280 changes: 26,656 additions & 624 deletions gilda/resources/mesh_mappings.tsv

Large diffs are not rendered by default.

10 changes: 8 additions & 2 deletions gilda/term.py
@@ -154,8 +154,14 @@ def get_identifiers_url(db, id):
return f'https://identifiers.org/{curie}'


def _term_key(term: Term) -> Tuple[str, str, str]:
return term.db, term.id, term.text
def _term_key(term: Term) -> Tuple[str, str, str, str, str]:
# We include source_id and source_db to avoid losing
# potentially important links back to mapped source IDs,
# but we have to make sure these are strings since otherwise
# they could be None, which can't be sorted against strings
source_db = term.source_db or ''
source_id = term.source_id or ''
return term.db, term.id, source_db, source_id, term.text


statuses = {'curated': 1, 'name': 2, 'synonym': 3, 'former_name': 4}
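The comment in `_term_key` above is the crux of the change: two terms that share db, id, and text but arrived via different source mappings previously collapsed into one during duplicate filtering, losing the link back to the source ID. A minimal sketch of the difference, assuming a simplified term tuple and a generic dedup-by-key helper rather than Gilda's actual filter_out_duplicates:

from typing import NamedTuple, Optional, Tuple

class T(NamedTuple):  # simplified stand-in for gilda.term.Term
    db: str
    id: str
    text: str
    source_db: Optional[str] = None
    source_id: Optional[str] = None

def new_key(t: T) -> Tuple[str, str, str, str, str]:
    # None-safe: empty strings keep the tuple sortable
    return t.db, t.id, t.source_db or '', t.source_id or '', t.text

# The same MeSH grounding reached natively and via a DOID xref
native = T('MESH', 'D008288', 'malaria')
mapped = T('MESH', 'D008288', 'malaria', source_db='DOID',
           source_id='DOID:12365')

old_keys = {(t.db, t.id, t.text) for t in (native, mapped)}
new_keys = {new_key(t) for t in (native, mapped)}
assert len(old_keys) == 1  # old key: provenance collapsed away
assert len(new_keys) == 2  # new key: both links survive deduplication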
4 changes: 2 additions & 2 deletions gilda/tests/test_grounder.py
@@ -56,7 +56,7 @@ def test_grounder_bug():

def test_grounder_num_entries():
entries = gr.lookup('NPM1')
assert len(entries) == 4, entries
assert len(entries) == 5, entries
entries = gr.lookup('H4')
assert len(entries) == 7, entries

@@ -202,7 +202,7 @@ def test_unidecode():
for txt in txts:
matches = gr.ground(txt)
assert len(matches) == 2
assert {m.term.db for m in matches} == {'EFO', 'DOID'}
assert {m.term.db for m in matches} == {'EFO', 'MESH'}

txts = ['Bi₇O₉I₃', 'Bi7O9I3']
for txt in txts:
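The updated counts and namespaces reflect the regenerated mapping tables: an extra NPM1 entry survives deduplication thanks to the widened term key, and disease names that previously grounded to DOID now ground to their mapped MeSH terms. A quick interactive check against a built index (the query below is a hypothetical illustration; exact results depend on the installed resource version):

from gilda import ground

# Any disease name whose DOID term was remapped to MeSH would show
# the same namespace shift as the updated test above
matches = ground('anemia')
print({m.term.db for m in matches})  # expected to include 'MESH'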
176 changes: 149 additions & 27 deletions scripts/generate_mesh_mappings.py
@@ -1,4 +1,5 @@
import os
import pandas
from collections import defaultdict
from gilda.generate_terms import *
from indra.databases import mesh_client
@@ -16,17 +17,57 @@ def is_chemical(mesh_id):
return mesh_client.is_molecular(mesh_id)


def render_row(me, te):
return '\t'.join([me.db, me.id, me.entry_name,
te.db, te.id, te.entry_name])
def load_biomappings():
"""Load curated positive and negative mappings from Biomappings."""
url_base = ('https://raw.githubusercontent.com/biopragmatics/biomappings/'
'master/src/biomappings/resources/')
positive_df = pandas.read_csv(url_base + 'mappings.tsv', sep='\t')
negative_df = pandas.read_csv(url_base + 'incorrect.tsv', sep='\t')
positive_mappings = defaultdict(list)
negative_mappings = defaultdict(list)
# These are the only relevant prefixes; there are mappings to
# various other namespaces that we don't need
prefixes = {'fplx', 'chebi', 'go', 'hp', 'doid', 'efo', 'hgnc'}
for mapping_df, mappings in ((positive_df, positive_mappings),
(negative_df, negative_mappings)):
for _, row in mapping_df.iterrows():
# We only need exact matches.
# TODO: should we consider non-exact matches to be effectively
# negative?
if row['relation'] != 'skos:exactMatch':
continue
# Look at both directions in which mesh mappings
# can appear
if row['source prefix'] == 'mesh':
mesh_id = row['source identifier']
other_ns = row['target prefix']
other_id = row['target identifier']
elif row['target prefix'] == 'mesh':
mesh_id = row['target identifier']
other_ns = row['source prefix']
other_id = row['source identifier']
else:
continue
if other_ns not in prefixes:
continue
# We make the namespace upper to be consistent
# with Gilda
mappings[mesh_id].append((other_ns.upper(), other_id))
return positive_mappings, negative_mappings


def get_nonambiguous(maps):
# If there are more than one mappings from MESH
# If there is more than one mapping from MESH
if len(maps) > 1:
# We see if there are any name-level matches
name_matches = [(me, te) for me, te in maps
if me.entry_name.lower() == te.entry_name.lower()]
if (me.entry_name.lower() if me.entry_name else '')
== (te.entry_name.lower() if te.entry_name else '')
# Corner case where we have multiple MeSH-based terms
# due to an original term from e.g., DOID having been
# mapped to MeSH
and me.db != te.db]

# If we still have ambiguity, we print to the user
if not name_matches or len(name_matches) > 1:
return None, maps
@@ -42,7 +83,7 @@ def resolve_duplicates(mappings):
keep_mappings = []
all_ambigs = []
# First we deal with mappings from MESH
for maps in mappings.values():
for key, maps in mappings.items():
maps_list = maps.values()
keep, ambigs = get_nonambiguous(maps_list)
if keep:
@@ -67,9 +108,17 @@


def dump_mappings(mappings, fname):
def render_row(me, te):
return '\t'.join([me.db, me.id, me.entry_name,
te.db, te.id, te.entry_name])

with open(fname, 'w') as fh:
for mesh_term, other_term in sorted(mappings, key=lambda x: x[0].id):
fh.write(render_row(mesh_term, other_term) + '\n')
# Corner case where we have multiple MeSH-based terms
# due to an original term from e.g., DOID having been
# mapped to MeSH
if other_term.db != 'MESH':
fh.write(render_row(mesh_term, other_term) + '\n')


def get_ambigs_by_db(ambigs):
@@ -83,9 +132,9 @@ def get_mesh_mappings(ambigs):
mappings_by_mesh_id = defaultdict(dict)
for text, ambig_terms in ambigs.items():
ambigs_by_db = get_ambigs_by_db(ambig_terms)
print('Considering %s' % text)
for term in ambig_terms:
print('%s:%s %s' % (term.db, term.id, term.entry_name))
#print('Considering %s' % text)
#for term in ambig_terms:
# print('%s:%s %s' % (term.db, term.id, term.entry_name))
order = [('FPLX', is_protein),
('HGNC', is_protein),
('CHEBI', is_chemical),
@@ -98,11 +147,11 @@
if len(ambigs_by_db.get(ns, [])) == 1 and mesh_constraint(me.id):
mappings_by_mesh_id[me.id][(ambigs_by_db[ns][0].db,
ambigs_by_db[ns][0].id)] = \
(me, ambigs_by_db[ns][0])
print('Adding mapping for %s' % ns)
[me, ambigs_by_db[ns][0]]
#print('Adding mapping for %s' % ns)
break
print('--------------')
return mappings_by_mesh_id
#print('--------------')
return dict(mappings_by_mesh_id)


def find_ambiguities(terms, match_attr='text'):
@@ -112,13 +161,24 @@
# We consider it an ambiguity if the same text entry appears
# multiple times
ambig_entries[match_fun(term)].append(term)
# There is a corner case where the match_fun matches two different
# synonyms / variants of the same entry from the same database; these
# are not really an ambiguity but need to be reduced to a single
# entry to avoid being inadvertently filtered out later
ambig_entries = {
# Here, we make sure we only keep a single term with a given db and id
norm_term: list({(term.db, term.id): term for term in matching_terms}.values())
for norm_term, matching_terms in ambig_entries.items()
}
# It's only an ambiguity if there are at least two entries
ambig_entries = {k: v for k, v in ambig_entries.items() if len(v) >= 2}
ambig_entries = {norm_term: matching_terms
for norm_term, matching_terms
in ambig_entries.items() if len(matching_terms) >= 2}
# We filter out any ambiguities that don't contain exactly one MeSH term
ambig_entries = {k: v for k, v in ambig_entries.items()
if len([e for e in v if e.db == 'MESH']) == 1}
print('Found a total of %d relevant ambiguities' % len(ambig_entries))
return ambig_entries
return dict(ambig_entries)


def get_terms():
@@ -135,10 +195,7 @@
return terms


def manual_go_mappings(terms):
td = defaultdict(list)
for term in terms:
td[(term.db, term.id)].append(term)
def manual_go_mappings(terms_by_id_tuple):
# Migrated from FamPlex and INDRA
map = [
('D002465', 'GO:0048870'),
Expand All @@ -153,14 +210,29 @@ def manual_go_mappings(terms):
]
mappings_by_mesh_id = defaultdict(dict)
for mid, gid in map:
mt = td[('MESH', mid)][0]
gt = td[('GO', gid)][0]
mt = terms_by_id_tuple[('MESH', mid)]
gt = terms_by_id_tuple[('GO', gid)]
mappings_by_mesh_id[mid][('GO', gid)] = (mt, gt)
return mappings_by_mesh_id
return dict(mappings_by_mesh_id)


if __name__ == '__main__':
terms = get_terms()
# We create a lookup of term objects by their db/id tuple
# for quick lookups. We also add source db/ids here
# because they can be relevant when finding terms for
# Biomappings curations. Note that when loading e.g.,
# DOID terms, the native xrefs from DOID to MESH
# are applied, even if terms are loaded with the ignore_mappings
# option, which just turns off loading the mappings that are
# generated in this script.
known_mappings = set()
terms_by_id_tuple = {}
for term in terms:
terms_by_id_tuple[(term.db, term.id)] = term
if term.source_id:
terms_by_id_tuple[(term.source_db, term.source_id)] = term
known_mappings.add((term.db, term.id, term.source_db, term.source_id))
# General ambiguities
ambigs = find_ambiguities(terms, match_attr='text')
mappings = get_mesh_mappings(ambigs)
@@ -171,11 +243,61 @@ def manual_go_mappings(terms):
for k, v in mappings2.items():
if k not in mappings:
mappings[k] = v
mappings3 = manual_go_mappings(terms)
# Mappings from GO terms
mappings3 = manual_go_mappings(terms_by_id_tuple)
for k, v in mappings3.items():
if k not in mappings:
mappings[k] = v
mappings, mapping_ambigs = resolve_duplicates(mappings)
dump_mappings(mappings, os.path.join(resources, 'mesh_mappings.tsv'))
dump_mappings(mapping_ambigs,

# We now have to account for Biomappings curations
positive_biomappings, negative_biomappings = load_biomappings()
keys_to_remove = set()
# Iterate over all the automatically proposed mappings
for mesh_id, local_mappings in mappings.items():
# If we already have a positive curation for the given MeSH ID
# we want to replace the content automatically generated here
# with the terms corresponding to the positive curation
if mesh_id in positive_biomappings:
other_ids = positive_biomappings[mesh_id]
new_mappings = {}
for other_id in other_ids:
# If the other ID already exists, we just copy it over
if other_id in mappings[mesh_id]:
new_mappings[other_id] = mappings[mesh_id][other_id]
# If it doesn't exist yet, we look up a Term for it
# and add it to the mappings
else:
if other_id in terms_by_id_tuple:
mesh_term = terms_by_id_tuple[('MESH', mesh_id)]
other_term = terms_by_id_tuple[other_id]
new_mappings[other_id] = [mesh_term, other_term]
# This is a corner case where something is in Biomappings
# but not in the set of Gilda terms. This can happen
# if a term has been deprecated/replaced in an ontology.
# We ignore these mappings and just keep what we have.
else:
print('%s missing from set of terms' % str(other_id))
new_mappings = mappings[mesh_id]
mappings[mesh_id] = new_mappings
# If we have a negative curation for this MeSH ID, we make sure
# that we remove any known incorrect mappings
if mesh_id in negative_biomappings:
other_ids = negative_biomappings[mesh_id]
if mesh_id in mappings:
for other_id in other_ids:
if other_id in mappings[mesh_id]:
mappings[mesh_id].pop(other_id, None)
# If nothing is left, we remove the whole MeSH ID key
if not mappings[mesh_id]:
keys_to_remove.add(mesh_id)
for key in keys_to_remove:
mappings.pop(key)
nonambig_mappings, ambig_mappings = resolve_duplicates(mappings)
dump_mappings(nonambig_mappings, os.path.join(resources, 'mesh_mappings.tsv'))
dump_mappings(ambig_mappings,
os.path.join(resources, 'mesh_ambig_mappings.tsv'))

# Known mappings are useful for debugging
#with open(os.path.join(resources, 'known_mappings.tsv'), 'w') as fh:
# for db, id, source_db, source_id in sorted(known_mappings):
# fh.write('\t'.join([db, id, source_db, source_id]) + '\n')
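In summary, the `__main__` block now layers Biomappings curations over the automatically proposed mappings: positive curations replace or supplement the proposed targets for a MeSH ID, and negative curations delete known-incorrect ones before duplicates are resolved and the TSVs are dumped. A condensed sketch of that precedence logic, using plain dicts in place of Term objects and skipping the term lookup that the real script performs for curated pairs it never auto-proposed:

from typing import Dict, List, Tuple

Key = Tuple[str, str]  # (namespace, identifier), e.g., ('DOID', '12365')

def apply_curations(
    mappings: Dict[str, Dict[Key, list]],
    positive: Dict[str, List[Key]],
    negative: Dict[str, List[Key]],
) -> Dict[str, Dict[Key, list]]:
    # Positive curations win: keep only curated targets (simplified; the
    # real script also adds Terms for curated pairs it hadn't proposed)
    for mesh_id, curated in positive.items():
        if mesh_id in mappings:
            kept = {k: v for k, v in mappings[mesh_id].items()
                    if k in curated}
            if kept:
                mappings[mesh_id] = kept
    # Negative curations remove known-incorrect targets
    for mesh_id, wrong in negative.items():
        for key in wrong:
            mappings.get(mesh_id, {}).pop(key, None)
    # Drop any MeSH IDs with no targets left
    return {mesh_id: v for mesh_id, v in mappings.items() if v}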