Skip to content

Commit

Permalink
Update MeSH predictions and curations (#168)
Browse files Browse the repository at this point in the history
Related to gyorilab/gilda#140, there are a lot
of new predictions, and I curated a few instances here and there to
improve results.
  • Loading branch information
bgyori authored Jul 18, 2024
2 parents 0f537af + 09bb2fb commit ca7f46d
Show file tree
Hide file tree
Showing 5 changed files with 13,883 additions and 12 deletions.
33 changes: 32 additions & 1 deletion scripts/import_gilda_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
from typing import Iterable

from biomappings import load_false_mappings, load_mappings
from biomappings.resources import PredictionTuple, append_prediction_tuples
from biomappings.utils import get_script_url

Expand All @@ -26,6 +27,8 @@
"NCIT": "ncit",
"GO": "go",
"FPLX": "fplx",
"UP": "uniprot",
"MESH": "mesh",
}


Expand All @@ -45,16 +48,44 @@ def get_primary_mappings():
return mappings


def get_curated_mappings():
    """Collect the curated mappings as a set of direction-agnostic keys.

    Reads the records returned by ``load_mappings()`` followed by those
    returned by ``load_false_mappings()``. Every record contributes two
    4-tuples of the form ``(prefix, identifier, prefix, identifier)``: one
    ordered source -> target and one ordered target -> source, so a lookup
    succeeds regardless of which side the caller puts first.

    :returns: A set of 4-tuples of prefixes and identifiers.
    """
    seen = set()
    records = load_mappings() + load_false_mappings()
    for record in records:
        source = (record["source prefix"], record["source identifier"])
        target = (record["target prefix"], record["target identifier"])
        # Store both orientations so membership tests are symmetric.
        seen.add(source + target)
        seen.add(target + source)
    return seen


def get_mappings() -> Iterable[PredictionTuple]:
"""Iterate lexical mappings from Gilda."""
url = get_script_url(__file__)
mapping_type = "semapv:LexicalMatching"
match_type = "skos:exactMatch"
confidence = 0.95
primary_mappings = get_primary_mappings()
curated_mappings = get_curated_mappings()
with open(GILDA_MAPPINGS, "r") as fh:
for _, mesh_id, mesh_name, db_ns, db_id, db_name in csv.reader(fh, delimiter="\t"):
if ("mesh", mesh_id, db_ns, db_id) in primary_mappings:
if ("mesh", mesh_id, db_ns_mappings[db_ns], db_id) in primary_mappings or (
"mesh",
mesh_id,
db_ns_mappings[db_ns],
db_id,
) in curated_mappings:
continue
yield PredictionTuple(
"mesh",
Expand Down
5 changes: 5 additions & 0 deletions src/biomappings/resources/incorrect.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,9 @@ mesh D000068256 Darbepoetin alfa skos:exactMatch hgnc 4392 GNAS semapv:ManualMap
mesh D000068437 Pemetrexed skos:exactMatch chebi CHEBI:17509 5'-S-methyl-5'-thioadenosine semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D000068800 Etanercept skos:exactMatch hgnc 11917 TNFRSF1B semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D000070636 Rotator Cuff Injuries skos:exactMatch efo 1001250 rotator cuff tear semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D000071071 Microaneurysm skos:exactMatch hp HP:0032416 Retinal microaneurysm semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D000071636 Protein Phosphatase 2C skos:exactMatch hgnc 9279 PDP1 semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D000071960 Breast Carcinoma In Situ skos:exactMatch ncit C3641 Stage 0 Breast Cancer AJCC v6 and v7 semapv:ManualMappingCuration orcid:0000-0001-9439-5346 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/a80ed2/scripts/import_gilda_mappings.py 0.95
mesh D000074767 Diapause skos:exactMatch go GO:0030431 sleep semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D000077190 Interferon alpha-2 skos:exactMatch hgnc 5423 IFNA2 semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D000077212 Ropivacaine skos:exactMatch chebi CHEBI:8890 (S)-ropivacaine semapv:ManualMappingCuration orcid:0000-0003-4423-4370
Expand Down Expand Up @@ -866,13 +868,16 @@ mesh D054467 Phospholipases A2 skos:exactMatch hgnc 9030 PLA2G1B semapv:ManualMa
mesh D054629 Genome, Mitochondrial skos:exactMatch go GO:0000262 mitochondrial chromosome semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D054740 Dendritic Cell Sarcoma, Follicular skos:exactMatch doid DOID:7849 dendritic cell sarcoma semapv:ManualMappingCuration orcid:0000-0003-1307-2508
mesh D054818 Hexosaminidase A skos:exactMatch go GO:0004563 beta-N-acetylhexosaminidase activity semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D055607 Receptors, Natural Killer Cell skos:exactMatch hgnc 6378 KLRD1 semapv:ManualMappingCuration orcid:0000-0001-9439-5346 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/5c44c0/scripts/import_gilda_mappings.py 0.95
mesh D055752 Small Cell Lung Carcinoma skos:exactMatch doid DOID:5411 lung oat cell carcinoma semapv:ManualMappingCuration orcid:0000-0003-1307-2508
mesh D057135 Wet Macular Degeneration skos:exactMatch doid DOID:10873 Kuhnt-Junius degeneration semapv:ManualMappingCuration orcid:0000-0003-1307-2508
mesh D058494 Walker-Warburg Syndrome skos:exactMatch doid DOID:0111237 congenital muscular dystrophy-dystroglycanopathy type A1 semapv:ManualMappingCuration orcid:0000-0003-1307-2508
mesh D058570 TOR Serine-Threonine Kinases skos:exactMatch hgnc 3942 MTOR semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D063807 Dandruff skos:exactMatch doid DOID:8941 seborrheic infantile dermatitis semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D063847 Mean Platelet Volume skos:exactMatch ncit C74730 Mean Platelet Volume Measurement semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D063948 Enslaved Persons skos:exactMatch ncit C153898 Slavey Language semapv:ManualMappingCuration orcid:0000-0001-9439-5346
mesh D064046 Secretagogins skos:exactMatch hgnc 16941 SCGN semapv:ManualMappingCuration orcid:0000-0001-9439-5346 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/5c44c0/scripts/import_gilda_mappings.py 0.95
mesh D064429 Fatty Acid Synthases skos:exactMatch hgnc 3594 FASN semapv:ManualMappingCuration orcid:0000-0001-9439-5346 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/5c44c0/scripts/import_gilda_mappings.py 0.95
mesh D064697 Racemethionine skos:exactMatch chebi CHEBI:16811 methionine semapv:ManualMappingCuration orcid:0000-0003-4423-4370
mesh D065627 Familial Primary Pulmonary Hypertension skos:exactMatch doid DOID:14557 primary pulmonary hypertension semapv:ManualMappingCuration orcid:0000-0003-1307-2508
mesh D065637 Cytochrome P-450 CYP2A6 skos:exactMatch hgnc 2610 CYP2A6 semapv:ManualMappingCuration orcid:0000-0003-1307-2508
Expand Down
Loading

0 comments on commit ca7f46d

Please sign in to comment.