From b250ea2481438b3ab82815843e2dc4bfe25539a8 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 3 Jul 2023 16:16:22 +0200 Subject: [PATCH] Add additional MeSH mappings --- scripts/generate_clo_mesh_mappings.py | 42 +++++++++++++++++++++++ src/biomappings/gilda_utils.py | 5 ++- src/biomappings/mapping_graph.py | 25 +++++++++++--- src/biomappings/resources/incorrect.tsv | 4 +++ src/biomappings/resources/mappings.tsv | 14 ++++++++ src/biomappings/resources/predictions.tsv | 6 +++- src/biomappings/utils.py | 4 +++ 7 files changed, 92 insertions(+), 8 deletions(-) create mode 100644 scripts/generate_clo_mesh_mappings.py diff --git a/scripts/generate_clo_mesh_mappings.py b/scripts/generate_clo_mesh_mappings.py new file mode 100644 index 00000000..36884e0d --- /dev/null +++ b/scripts/generate_clo_mesh_mappings.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- + +"""Generate mappings to CLO from to MeSH.""" + +import click +from more_click import verbose_option +from semra.sources.clo import get_clo_mappings + +from biomappings.gilda_utils import append_gilda_predictions +from biomappings.mapping_graph import get_filter_from_semra +from biomappings.utils import get_script_url + + +@click.command() +@verbose_option +def main(): + """Generate CLO-MeSH mappings.""" + provenance = get_script_url(__file__) + + + + prefix = "clo" + targets = [ + "mesh", + "efo", + "bto", + "cellosaurus", + ] + + clo_mappings = get_clo_mappings() + custom_filter = get_filter_from_semra(clo_mappings) + + append_gilda_predictions( + prefix, + targets, + provenance=provenance, + custom_filter=custom_filter, + ) + + +if __name__ == "__main__": + main() diff --git a/src/biomappings/gilda_utils.py b/src/biomappings/gilda_utils.py index 3acc5515..166abbd9 100644 --- a/src/biomappings/gilda_utils.py +++ b/src/biomappings/gilda_utils.py @@ -4,7 +4,7 @@ import logging from collections import defaultdict -from typing import Iterable, Mapping, Optional, Tuple, Union +from typing import Iterable, Optional, Tuple, Union import bioregistry import pyobo @@ -13,11 +13,10 @@ from pyobo.gilda_utils import get_grounder, iter_gilda_prediction_tuples from biomappings.resources import PredictionTuple, append_prediction_tuples +from biomappings.utils import CMapping logger = logging.getLogger(__name__) -CMapping = Mapping[str, Mapping[str, Mapping[str, str]]] - def append_gilda_predictions( prefix: str, diff --git a/src/biomappings/mapping_graph.py b/src/biomappings/mapping_graph.py index f3d693b4..f2c6bfcb 100644 --- a/src/biomappings/mapping_graph.py +++ b/src/biomappings/mapping_graph.py @@ -4,15 +4,32 @@ import itertools as itt from collections import defaultdict -from typing import DefaultDict, Dict, Iterable, Mapping, Optional +from typing import TYPE_CHECKING, DefaultDict, Dict, Iterable, List, Optional import networkx as nx import pyobo +from biomappings.utils import CMapping -def get_custom_filter( - prefix: str, targets: Iterable[str] -) -> Mapping[str, Mapping[str, Mapping[str, str]]]: +if TYPE_CHECKING: + import semra + +__all__ = [ + "get_custom_filter", + "get_filter_from_semra", + "mutual_mapping_graph", +] + + +def get_filter_from_semra(mappings: List["semra.Mapping"]) -> CMapping: + """Get a custom filter dictionary from a set of SeMRA mappings.""" + rv = defaultdict(lambda: defaultdict(dict)) + for mapping in mappings: + rv[mapping.s.prefix][mapping.o.prefix][mapping.s.identifier] = mapping.o.identifier + return rv + + +def get_custom_filter(prefix: str, targets: Iterable[str]) -> CMapping: """Get a custom filter dictionary induced over the mutual mapping graph with all target prefixes. :param prefix: The source prefix diff --git a/src/biomappings/resources/incorrect.tsv b/src/biomappings/resources/incorrect.tsv index 3316f460..9371bcbf 100644 --- a/src/biomappings/resources/incorrect.tsv +++ b/src/biomappings/resources/incorrect.tsv @@ -194,8 +194,12 @@ cl CL:2000004 pituitary gland cell skos:exactMatch mesh D010902 Pituitary Gland cl CL:2000021 sebaceous gland cell skos:exactMatch mesh D012627 Sebaceous Glands semapv:ManualMappingCuration orcid:0000-0001-9439-5346 cl CL:2000022 cardiac septum cell skos:exactMatch mesh D006346 Heart Septum semapv:ManualMappingCuration orcid:0000-0001-9439-5346 cl CL:2000030 hypothalamus cell skos:exactMatch mesh D007031 Hypothalamus semapv:ManualMappingCuration orcid:0000-0001-9439-5346 +clo 0001922 BE2 cell skos:exactMatch mesh D016175 B-Lymphocyte Subsets semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.502 clo 0002596 COS-1 cell skos:exactMatch mesh D019556 COS Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 clo 0002597 COS-7 cell skos:exactMatch mesh D019556 COS Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0002941 EPI cell skos:exactMatch mesh D015251 Epirubicin semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.556 +clo 0003413 G cell skos:exactMatch mesh D019863 Gastrin-Secreting Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0009279 TC-1 cell skos:exactMatch mesh D013602 T-Lymphocytes, Cytotoxic semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 clo 0037163 Ishikawa cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 clo 0037237 293-derived cell skos:exactMatch cellosaurus CVCL_0045 HEK293 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 clo 0037261 3T3-derived cell skos:exactMatch mesh D016475 3T3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 diff --git a/src/biomappings/resources/mappings.tsv b/src/biomappings/resources/mappings.tsv index 9bcdc2e8..29413292 100644 --- a/src/biomappings/resources/mappings.tsv +++ b/src/biomappings/resources/mappings.tsv @@ -3011,11 +3011,25 @@ cl CL:0008002 skeletal muscle fiber skos:exactMatch mesh D018485 Muscle Fibers, cl CL:0010003 epithelial cell of alveolus of lung skos:exactMatch mesh D056809 Alveolar Epithelial Cells semapv:ManualMappingCuration orcid:0000-0001-9439-5346 cl CL:0010017 zygote skos:exactMatch mesh D015053 Zygote semapv:ManualMappingCuration orcid:0000-0001-9439-5346 cl CL:0010021 cardiac myoblast skos:exactMatch mesh D032386 Myoblasts, Cardiac semapv:ManualMappingCuration orcid:0000-0001-9439-5346 +clo 0000031 cell line skos:exactMatch mesh D002460 Cell Line semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.762 clo 0001230 HEK293 skos:exactMatch cellosaurus CVCL_0045 HEK293 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 clo 0001345 3T3 cell skos:exactMatch mesh D016475 3T3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0001601 A549 cell skos:exactMatch mesh D000072283 A549 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 clo 0002585 COR-L23 cell skos:exactMatch efo 0002142 CORL23 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0003704 Hep G2 cell skos:exactMatch mesh D056945 Hep G2 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0007606 MCF7 cell skos:exactMatch mesh D061986 MCF-7 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0007634 MDA-MB-231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0007634 MDA-MB-231 cell skos:exactMatch mesh D000092302 MDA-MB-231 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0007646 MDCK cell skos:exactMatch mesh D061985 Madin Darby Canine Kidney Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0008395 PC-3 cell skos:exactMatch mesh D000078722 PC-3 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0008753 RAW 264.7 cell skos:exactMatch mesh D000067996 RAW 264.7 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0009348 THP-1 cell skos:exactMatch mesh D000074084 THP-1 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 +clo 0036932 Hybridoma skos:exactMatch mesh D006825 Hybridomas semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.556 +clo 0036936 Somatic cell hybrid skos:exactMatch mesh D006822 Hybrid Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 clo 0037230 Ishikawa 3-H-12 cell skos:exactMatch cellosaurus CVCL_D199 Ishikawa 3-H-12 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0037291 MDAMB231 cell skos:exactMatch mesh D000092302 MDA-MB-231 Cells semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.549 clo 0037300 BALL-1 cell skos:exactMatch cellosaurus CVCL_1075 BALL-1 semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:UnspecifiedMatching clo 0.8 +clo 0037339 tissue donor skos:exactMatch mesh D014019 Tissue Donors semapv:ManualMappingCuration orcid:0000-0003-4423-4370 semapv:LexicalMatching https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py 0.54 doid DOID:0040002 aspirin allergy skos:exactMatch umls C0004058 Allergy to aspirin semapv:ManualMappingCuration orcid:0000-0003-4423-4370 doid DOID:0040004 amoxicillin allergy skos:exactMatch umls C0571417 Allergy to amoxicillin semapv:ManualMappingCuration orcid:0000-0003-4423-4370 doid DOID:0040005 ceftriaxone allergy skos:exactMatch umls C0571463 Allergy to ceftriaxone semapv:ManualMappingCuration orcid:0000-0003-4423-4370 diff --git a/src/biomappings/resources/predictions.tsv b/src/biomappings/resources/predictions.tsv index d76914f2..c87b853f 100644 --- a/src/biomappings/resources/predictions.tsv +++ b/src/biomappings/resources/predictions.tsv @@ -9459,7 +9459,11 @@ chebi CHEBI:9954 Verbenalin skos:exactMatch mesh C000511 cornin iridoid semapv:L chebi CHEBI:9955 (R)-(+)-verbenone skos:exactMatch mesh C052875 verbenone semapv:LexicalMatching 0.95 generate_chebi_mesh_mappings.py clo 0007050 K 562 cell skos:exactMatch cellosaurus CVCL_0004 K-562 semapv:UnspecifiedMatching 0.8 clo clo 0007059 K-562 cell skos:exactMatch cellosaurus CVCL_0004 K-562 semapv:UnspecifiedMatching 0.8 clo -clo 0007634 MDA-MB-231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:UnspecifiedMatching 0.8 clo +clo 0007219 L929 cell skos:exactMatch mesh D007739 L Cells semapv:LexicalMatching 0.549 https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py +clo 0007220 L-929 cell skos:exactMatch mesh D007739 L Cells semapv:LexicalMatching 0.549 https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py +clo 0008987 SF-21 cell skos:exactMatch mesh D061987 Sf9 Cells semapv:LexicalMatching 0.549 https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py +clo 0008988 SF-9 cell skos:exactMatch mesh D061987 Sf9 Cells semapv:LexicalMatching 0.549 https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py +clo 0008989 Sf9 cell skos:exactMatch mesh D061987 Sf9 Cells semapv:LexicalMatching 0.549 https://github.com/biomappings/biomappings/blob/746fde/scripts/generate_clo_mesh_mappings.py clo 0009034 SK-BR-3 cell skos:exactMatch efo 0001236 SKBR3 semapv:UnspecifiedMatching 0.8 clo clo 0009040 SK-MEL-1 cell skos:exactMatch efo 0002332 SKMEL1 semapv:UnspecifiedMatching 0.8 clo clo 0037291 MDAMB231 cell skos:exactMatch efo 0001209 MDAMB231 semapv:UnspecifiedMatching 0.8 clo diff --git a/src/biomappings/utils.py b/src/biomappings/utils.py index e69fc6a2..b529e79b 100644 --- a/src/biomappings/utils.py +++ b/src/biomappings/utils.py @@ -233,3 +233,7 @@ def get_curie(prefix: str, identifier: str) -> str: if prefix_norm is None or identifier_norm is None: raise ValueError(f"could not normalize {prefix}:{identifier}") return f"{prefix_norm}:{identifier_norm}" + + +#: A filter 3-dictionary of source prefix to target prefix to source identifier to target identifier +CMapping = Mapping[str, Mapping[str, Mapping[str, str]]]