
Merge pull request #140 from gyorilab/mesh_mapping
Improve MeSH mappings
bgyori authored Jul 18, 2024
2 parents 306e6c7 + 40d95cd commit f6500c4
Showing 8 changed files with 28,252 additions and 822 deletions.
4 changes: 2 additions & 2 deletions benchmarks/fplx_evaluation.py
@@ -18,7 +18,7 @@
'Aminopeptidases': {'MESH': 'D000626'},
'NF-AT proteins': {'MESH': 'D050778'},
'LTbetaR': {'HGNC': '6718'},
-'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016246'},
+'RNAi': {'MESH': 'D034622', 'GO': 'GO:0016441'},
'Chaetocin': {'CHEBI': 'CHEBI:68747'},
'BAY11-7082': {'CHEBI': 'CHEBI:85928'},
'Toll-like receptors': {'MESH': 'D051193'},
@@ -48,7 +48,7 @@
'integrin alpha': {'FPLX': 'ITGA'},
'DC': {'MESH': 'D003713'},
'BMD': {'MESH': 'D015519'},
-'angina': {'EFO': '0003913'}}
+'angina': {'MESH': 'D000787', 'EFO': '0003913'}}


incorrect_assertions = {'IGF': {'HGNC': '5464'},
2 changes: 1 addition & 1 deletion gilda/__init__.py
@@ -1,4 +1,4 @@
-__version__ = '1.2.1'
+__version__ = '1.3.0'

import logging

14 changes: 7 additions & 7 deletions gilda/generate_terms.py
@@ -565,13 +565,14 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
map_to_ns = {'MESH', 'DOID'}
terms = []
db, db_id, name = prefix.upper(), entry['id'], entry['name']
+entry_name = name
# We first need to decide if we prioritize another name space
xref_dict = {xr['namespace']: xr['id'] for xr in entry.get('xrefs', [])}
# Handle MeSH mappings first
auto_mesh_mapping = mesh_mappings_reverse.get((db, db_id))
if auto_mesh_mapping and not ignore_mappings:
-db, db_id, name = ('MESH', auto_mesh_mapping[0],
-                   auto_mesh_mapping[1])
+db, db_id, entry_name = ('MESH', auto_mesh_mapping[0],
+                         auto_mesh_mapping[1])
elif 'MESH' in map_to_ns and ('MESH' in xref_dict or 'MSH' in xref_dict):
mesh_id = xref_dict.get('MESH') or xref_dict.get('MSH')
# Since we currently only include regular MeSH terms (which start
@@ -583,7 +584,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# Here we need to check if we further map the MeSH ID to
# another namespace
mesh_mapping = mesh_mappings.get(mesh_id)
-db, db_id, name = mesh_mapping if \
+db, db_id, entry_name = mesh_mapping if \
(mesh_mapping and (mesh_mapping[0]
not in {'EFO', 'HP', 'DOID'})) \
else ('MESH', mesh_id, mesh_name)
@@ -601,15 +602,15 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
# If we don't get a name here, it's likely because an entry is
# obsolete so we don't do the mapping
if doid_name:
-db, db_id, name = 'DOID', doid, doid_name
+db, db_id, entry_name = 'DOID', doid, doid_name

# Add a term for the name first
name_term = Term(
norm_text=normalize(name),
text=name,
db=db,
id=db_id,
-entry_name=name,
+entry_name=entry_name,
status='name',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -639,7 +640,7 @@ def terms_from_obo_json_entry(entry, prefix, ignore_mappings=False,
text=synonym,
db=db,
id=db_id,
-entry_name=name,
+entry_name=entry_name,
status='synonym',
source=prefix,
source_db=prefix.upper() if db != prefix.upper() else None,
@@ -709,7 +710,6 @@ def get_all_terms():
]
for generated_terms in generated_term_groups:
terms += generated_terms
-
terms = filter_out_duplicates(terms)
return terms

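Note: the net effect of threading entry_name separately from name above is that a mapped term keeps the original ontology's searchable text while db, id, and entry_name point at the mapped MeSH entry. A minimal illustration with hypothetical identifiers (none of these values are from the commit):

from gilda.term import Term
from gilda.process import normalize

# Hypothetical DOID entry mapped to MeSH: text/norm_text keep the
# DOID-side name so it stays matchable, while db/id/entry_name carry
# the MeSH entry it was mapped to (all IDs and names here are invented)
term = Term(
    norm_text=normalize('example disease'),
    text='example disease',
    db='MESH',
    id='D999999',
    entry_name='Example Disease Heading',
    status='name',
    source='doid',
    source_db='DOID',
    source_id='DOID:9999999',
)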
1,584 changes: 1,427 additions & 157 deletions gilda/resources/mesh_ambig_mappings.tsv


27,280 changes: 26,656 additions & 624 deletions gilda/resources/mesh_mappings.tsv

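Note: both resource files use the six-column layout written by render_row in scripts/generate_mesh_mappings.py below: MeSH namespace, ID, and name, followed by the mapped namespace, ID, and name. A minimal parsing sketch, assuming that layout and that the non-ambiguous file has one row per MeSH ID:

import csv

mesh_mappings = {}
with open('gilda/resources/mesh_mappings.tsv') as fh:
    for mesh_ns, mesh_id, mesh_name, db, db_id, db_name in \
            csv.reader(fh, delimiter='\t'):
        # One mapped (namespace, ID, name) per MeSH ID
        mesh_mappings[mesh_id] = (db, db_id, db_name)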

10 changes: 8 additions & 2 deletions gilda/term.py
@@ -154,8 +154,14 @@ def get_identifiers_url(db, id):
return f'https://identifiers.org/{curie}'


-def _term_key(term: Term) -> Tuple[str, str, str]:
-    return term.db, term.id, term.text
+def _term_key(term: Term) -> Tuple[str, str, str, str, str]:
+    # We include source_id and source_db to avoid losing
+    # potentially important links back to mapped source IDs,
+    # but we have to make sure these are strings since otherwise
+    # they could be None, which can't be sorted against strings
+    source_db = term.source_db or ''
+    source_id = term.source_id or ''
+    return term.db, term.id, source_db, source_id, term.text


statuses = {'curated': 1, 'name': 2, 'synonym': 3, 'former_name': 4}
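Note: the widened key feeds duplicate filtering. A minimal sketch, not necessarily Gilda's exact filter_out_duplicates logic, of how the five-element key plus the statuses priority map above can deduplicate a term list:

from itertools import groupby

def dedupe_terms(terms):
    # Sketch only: terms that tie on the five-element key are ranked by
    # status priority (curated < name < synonym < former_name) and only
    # the best-ranked one survives
    ranked = sorted(terms, key=lambda t: (_term_key(t), statuses[t.status]))
    return [next(group) for _, group in groupby(ranked, key=_term_key)]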
4 changes: 2 additions & 2 deletions gilda/tests/test_grounder.py
@@ -56,7 +56,7 @@ def test_grounder_bug():

def test_grounder_num_entries():
entries = gr.lookup('NPM1')
-assert len(entries) == 4, entries
+assert len(entries) == 5, entries
entries = gr.lookup('H4')
assert len(entries) == 7, entries

@@ -202,7 +202,7 @@ def test_unidecode():
for txt in txts:
matches = gr.ground(txt)
assert len(matches) == 2
-assert {m.term.db for m in matches} == {'EFO', 'DOID'}
+assert {m.term.db for m in matches} == {'EFO', 'MESH'}

txts = ['Bi₇O₉I₃', 'Bi7O9I3']
for txt in txts:
176 changes: 149 additions & 27 deletions scripts/generate_mesh_mappings.py
@@ -1,4 +1,5 @@
import os
+import pandas
from collections import defaultdict
from gilda.generate_terms import *
from indra.databases import mesh_client
@@ -16,17 +17,57 @@ def is_chemical(mesh_id):
return mesh_client.is_molecular(mesh_id)


-def render_row(me, te):
-    return '\t'.join([me.db, me.id, me.entry_name,
-                      te.db, te.id, te.entry_name])
+def load_biomappings():
+    """Load curated positive and negative mappings from Biomappings."""
+    url_base = ('https://raw.githubusercontent.com/biopragmatics/biomappings/'
+                'master/src/biomappings/resources/')
+    positive_df = pandas.read_csv(url_base + 'mappings.tsv', sep='\t')
+    negative_df = pandas.read_csv(url_base + 'incorrect.tsv', sep='\t')
+    positive_mappings = defaultdict(list)
+    negative_mappings = defaultdict(list)
+    # These are the only relevant prefixes; there are mappings to
+    # various other namespaces we don't need
+    prefixes = {'fplx', 'chebi', 'go', 'hp', 'doid', 'efo', 'hgnc'}
+    for mapping_df, mappings in ((positive_df, positive_mappings),
+                                 (negative_df, negative_mappings)):
+        for _, row in mapping_df.iterrows():
+            # We only need exact matches.
+            # TODO: should we consider non-exact matches to be effectively
+            # negative?
+            if row['relation'] != 'skos:exactMatch':
+                continue
+            # Look at both directions in which mesh mappings
+            # can appear
+            if row['source prefix'] == 'mesh':
+                mesh_id = row['source identifier']
+                other_ns = row['target prefix']
+                other_id = row['target identifier']
+            elif row['target prefix'] == 'mesh':
+                mesh_id = row['target identifier']
+                other_ns = row['source prefix']
+                other_id = row['source identifier']
+            else:
+                continue
+            if other_ns not in prefixes:
+                continue
+            # We make the namespace upper to be consistent
+            # with Gilda
+            mappings[mesh_id].append((other_ns.upper(), other_id))
+    return positive_mappings, negative_mappings
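Note: a quick usage sketch for the loader above; assumes network access to the Biomappings GitHub repository:

positive, negative = load_biomappings()
# Each result maps a MeSH ID to a list of (NAMESPACE, identifier)
# tuples curated as correct or incorrect, respectively
print('Positive curations cover %d MeSH IDs' % len(positive))
print('Negative curations cover %d MeSH IDs' % len(negative))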


def get_nonambiguous(maps):
-# If there are more than one mappings from MESH
+# If there is more than one mapping from MESH
if len(maps) > 1:
# We see if there are any name-level matches
name_matches = [(me, te) for me, te in maps
-if me.entry_name.lower() == te.entry_name.lower()]
+if (me.entry_name.lower() if me.entry_name else '')
+== (te.entry_name.lower() if te.entry_name else '')
+# Corner case where we have multiple MeSH-based terms
+# due to an original term from e.g., DOID having been
+# mapped to MeSH
+and me.db != te.db]

# If we still have ambiguity, we print to the user
if not name_matches or len(name_matches) > 1:
return None, maps
@@ -42,7 +83,7 @@ def resolve_duplicates(mappings):
keep_mappings = []
all_ambigs = []
# First we deal with mappings from MESH
-for maps in mappings.values():
+for key, maps in mappings.items():
maps_list = maps.values()
keep, ambigs = get_nonambiguous(maps_list)
if keep:
@@ -67,9 +108,17 @@


def dump_mappings(mappings, fname):
+    def render_row(me, te):
+        return '\t'.join([me.db, me.id, me.entry_name,
+                          te.db, te.id, te.entry_name])
+
with open(fname, 'w') as fh:
for mesh_term, other_term in sorted(mappings, key=lambda x: x[0].id):
-fh.write(render_row(mesh_term, other_term) + '\n')
+# Corner case where we have multiple MeSH-based terms
+# due to an original term from e.g., DOID having been
+# mapped to MeSH
+if other_term.db != 'MESH':
+    fh.write(render_row(mesh_term, other_term) + '\n')


def get_ambigs_by_db(ambigs):
@@ -83,9 +132,9 @@ def get_mesh_mappings(ambigs):
mappings_by_mesh_id = defaultdict(dict)
for text, ambig_terms in ambigs.items():
ambigs_by_db = get_ambigs_by_db(ambig_terms)
-print('Considering %s' % text)
-for term in ambig_terms:
-    print('%s:%s %s' % (term.db, term.id, term.entry_name))
+#print('Considering %s' % text)
+#for term in ambig_terms:
+#    print('%s:%s %s' % (term.db, term.id, term.entry_name))
order = [('FPLX', is_protein),
('HGNC', is_protein),
('CHEBI', is_chemical),
@@ -98,11 +147,11 @@
if len(ambigs_by_db.get(ns, [])) == 1 and mesh_constraint(me.id):
mappings_by_mesh_id[me.id][(ambigs_by_db[ns][0].db,
ambigs_by_db[ns][0].id)] = \
-(me, ambigs_by_db[ns][0])
-print('Adding mapping for %s' % ns)
+[me, ambigs_by_db[ns][0]]
+#print('Adding mapping for %s' % ns)
break
-print('--------------')
-return mappings_by_mesh_id
+#print('--------------')
+return dict(mappings_by_mesh_id)


def find_ambiguities(terms, match_attr='text'):
Expand All @@ -112,13 +161,24 @@ def find_ambiguities(terms, match_attr='text'):
# We consider it an ambiguity if the same text entry appears
# multiple times
ambig_entries[match_fun(term)].append(term)
+# There is a corner case where the match_fun matches two different
+# synonyms / variants of the same entry from the same database, which
+# are not really an ambiguity but need to be reduced to a single
+# entry to avoid being inadvertently filtered out later
+ambig_entries = {
+    # Here, we make sure we only keep a single term with a given db and id
+    norm_term: list({(term.db, term.id): term for term in matching_terms}.values())
+    for norm_term, matching_terms in ambig_entries.items()
+}
# It's only an ambiguity if there are two entries at least
-ambig_entries = {k: v for k, v in ambig_entries.items() if len(v) >= 2}
+ambig_entries = {norm_term: matching_terms
+                 for norm_term, matching_terms
+                 in ambig_entries.items() if len(matching_terms) >= 2}
# We filter out any ambiguities that contain not exactly one MeSH term
ambig_entries = {k: v for k, v in ambig_entries.items()
if len([e for e in v if e.db == 'MESH']) == 1}
print('Found a total of %d relevant ambiguities' % len(ambig_entries))
-return ambig_entries
+return dict(ambig_entries)
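Note: a hedged illustration of the corner-case reduction above, using aspirin's real MeSH and ChEBI IDs but invented Term variants, and assuming match_attr can name any Term attribute such as norm_text:

from gilda.term import Term

# Two synonym variants of the same CHEBI entry plus one MeSH term,
# all normalizing to the same text (hypothetical terms)
mesh = Term('aspirin', 'aspirin', 'MESH', 'D001241', 'Aspirin',
            'name', 'mesh')
chebi_a = Term('aspirin', 'aspirin', 'CHEBI', 'CHEBI:15365',
               'acetylsalicylic acid', 'synonym', 'chebi')
chebi_b = Term('aspirin', 'Aspirin', 'CHEBI', 'CHEBI:15365',
               'acetylsalicylic acid', 'synonym', 'chebi')

ambigs = find_ambiguities([mesh, chebi_a, chebi_b], match_attr='norm_text')
# The two CHEBI variants collapse to a single term for CHEBI:15365, so
# the downstream one-term-per-namespace check in get_mesh_mappings still
# sees a clean one-MeSH-vs-one-CHEBI ambiguity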


def get_terms():
Expand All @@ -135,10 +195,7 @@ def get_terms():
return terms


-def manual_go_mappings(terms):
-    td = defaultdict(list)
-    for term in terms:
-        td[(term.db, term.id)].append(term)
+def manual_go_mappings(terms_by_id_tuple):
# Migrated from FamPlex and INDRA
map = [
('D002465', 'GO:0048870'),
Expand All @@ -153,14 +210,29 @@ def manual_go_mappings(terms):
]
mappings_by_mesh_id = defaultdict(dict)
for mid, gid in map:
-mt = td[('MESH', mid)][0]
-gt = td[('GO', gid)][0]
+mt = terms_by_id_tuple[('MESH', mid)]
+gt = terms_by_id_tuple[('GO', gid)]
mappings_by_mesh_id[mid][('GO', gid)] = (mt, gt)
-return mappings_by_mesh_id
+return dict(mappings_by_mesh_id)


if __name__ == '__main__':
terms = get_terms()
+# We create a lookup of term objects by their db/id tuple
+# for quick lookups. We also add source db/ids here
+# because they can be relevant when finding terms for
+# Biomappings curations. Note that when loading e.g.,
+# DOID terms, the native xrefs from DOID to MESH
+# are applied, even if terms are loaded with the ignore_mappings
+# option, which just turns off loading the mappings that are
+# generated in this script.
+known_mappings = set()
+terms_by_id_tuple = {}
+for term in terms:
+    terms_by_id_tuple[(term.db, term.id)] = term
+    if term.source_id:
+        terms_by_id_tuple[(term.source_db, term.source_id)] = term
+        known_mappings.add((term.db, term.id, term.source_db, term.source_id))
# General ambiguities
ambigs = find_ambiguities(terms, match_attr='text')
mappings = get_mesh_mappings(ambigs)
@@ -171,11 +243,61 @@ def manual_go_mappings(terms):
for k, v in mappings2.items():
if k not in mappings:
mappings[k] = v
-mappings3 = manual_go_mappings(terms)
+# Mappings from GO terms
+mappings3 = manual_go_mappings(terms_by_id_tuple)
for k, v in mappings3.items():
if k not in mappings:
mappings[k] = v
-mappings, mapping_ambigs = resolve_duplicates(mappings)
-dump_mappings(mappings, os.path.join(resources, 'mesh_mappings.tsv'))
-dump_mappings(mapping_ambigs,

+# We now have to account for Biomappings curations
+positive_biomappings, negative_biomappings = load_biomappings()
+keys_to_remove = set()
+# Iterate over all the automatically proposed mappings
+for mesh_id, local_mappings in mappings.items():
+    # If we already have a positive curation for the given MeSH ID
+    # we want to replace the content automatically generated here
+    # with the terms corresponding to the positive curation
+    if mesh_id in positive_biomappings:
+        other_ids = positive_biomappings[mesh_id]
+        new_mappings = {}
+        for other_id in other_ids:
+            # If the other ID already exists, we just copy it over
+            if other_id in mappings[mesh_id]:
+                new_mappings[other_id] = mappings[mesh_id][other_id]
+            # If it doesn't exist yet, we look up a Term for it
+            # and add it to the mappings
+            else:
+                if other_id in terms_by_id_tuple:
+                    mesh_term = terms_by_id_tuple[('MESH', mesh_id)]
+                    other_term = terms_by_id_tuple[other_id]
+                    new_mappings[other_id] = [mesh_term, other_term]
+                # This is a corner case where something is in Biomappings
+                # but not in the set of Gilda terms. This can happen
+                # if a term has been deprecated/replaced in an ontology.
+                # We ignore these mappings and just keep what we have.
+                else:
+                    print('%s missing from set of terms' % str(other_id))
+                    new_mappings = mappings[mesh_id]
+        mappings[mesh_id] = new_mappings
+    # If we have a negative curation for this MeSH ID, we make sure
+    # that we remove any known incorrect mappings
+    if mesh_id in negative_biomappings:
+        other_ids = negative_biomappings[mesh_id]
+        if mesh_id in mappings:
+            for other_id in other_ids:
+                if other_id in mappings[mesh_id]:
+                    mappings[mesh_id].pop(other_id, None)
+            # If nothing left, we remove the whole MeSH ID key
+            if not mappings[mesh_id]:
+                keys_to_remove.add(mesh_id)
+for key in keys_to_remove:
+    mappings.pop(key)
+nonambig_mappings, ambig_mappings = resolve_duplicates(mappings)
+dump_mappings(nonambig_mappings, os.path.join(resources, 'mesh_mappings.tsv'))
+dump_mappings(ambig_mappings,
os.path.join(resources, 'mesh_ambig_mappings.tsv'))

+# Known mappings are useful for debugging
+#with open(os.path.join(resources, 'known_mappings.tsv'), 'w') as fh:
+#    for db, id, source_db, source_id in sorted(known_mappings):
+#        fh.write('\t'.join([db, id, source_db, source_id]) + '\n')
