From cc7b2f05bb5776e9467255f09594870e85dfa6db Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Sun, 31 May 2020 00:22:39 +0200 Subject: [PATCH] Add identifiers.org links when possible in resolver service (#55) --- src/pyobo/apps/resolver/resolver.py | 16 +++- src/pyobo/apps/resolver/templates/home.html | 33 ++++---- src/pyobo/identifier_utils.py | 31 ++++++-- src/pyobo/registries/__init__.py | 4 +- src/pyobo/registries/metaregistry.json | 25 ++++++ src/pyobo/registries/metaregistry.py | 7 ++ src/pyobo/registries/utils.py | 5 +- src/pyobo/xrefdb/cli.py | 15 ++-- tests/test_get_miriam_url.py | 85 +++++++++++++++++++++ 9 files changed, 189 insertions(+), 32 deletions(-) create mode 100644 tests/test_get_miriam_url.py diff --git a/src/pyobo/apps/resolver/resolver.py b/src/pyobo/apps/resolver/resolver.py index 936ad31a..b8ca6290 100644 --- a/src/pyobo/apps/resolver/resolver.py +++ b/src/pyobo/apps/resolver/resolver.py @@ -25,7 +25,7 @@ from pyobo.apps.utils import gunicorn_option, host_option, port_option, run_app from pyobo.cli_utils import verbose_option from pyobo.constants import PYOBO_HOME -from pyobo.identifier_utils import normalize_curie +from pyobo.identifier_utils import get_identifiers_org_link, normalize_curie resolve_blueprint = Blueprint('resolver', __name__) @@ -93,15 +93,24 @@ def _help_resolve(curie: str) -> Mapping[str, Any]: message='Could not identify prefix', ) + miriam = get_identifiers_org_link(prefix, identifier) + id_name_mapping = get_id_name_mapping(prefix) if id_name_mapping is None: - return dict( + rv = dict( query=curie, prefix=prefix, identifier=identifier, success=False, - message='Could not find id->name mapping for prefix', ) + if miriam: + rv.update(dict( + miriam=miriam, + message='Could not find id->name mapping for prefix, but still able to report Identifiers.org link', + )) + else: + rv['message'] = 'Could not find id->name mapping for prefix' + return rv name = id_name_mapping.get(identifier) if name is None: @@ -119,6 +128,7 @@ def _help_resolve(curie: str) -> Mapping[str, Any]: identifier=identifier, name=name, success=True, + miriam=miriam, ) diff --git a/src/pyobo/apps/resolver/templates/home.html b/src/pyobo/apps/resolver/templates/home.html index f9153efa..6ef52cd1 100644 --- a/src/pyobo/apps/resolver/templates/home.html +++ b/src/pyobo/apps/resolver/templates/home.html @@ -14,7 +14,7 @@

Ooh Na Na CURIE Resolver

This service resolves CURIEs - to their labels. + to their labels and builds Identifiers.org URLs when possible.

It has a single endpoint /resolve/<curie> that gives back JSON. @@ -30,37 +30,42 @@

Ooh Na Na CURIE Resolver

DOID:14330 - ✅ exact + 🟢 exact doid:14330 - ✅ capitalization variant + 🟢 capitalization variant DO:14330 - ✅ synonym identified in metaregistry + 🟢 synonym identified in metaregistry - DOID:00000 - ❌ invalid identifier + apo:0000155 + 🟡 able to look up name, but prefix is not listed on Identifiers.org - NNN:00000 - ❌ invalid prefix + wikidata:Q42 + 🟡 able to generate Identifiers.org link, but name unavailable - wikidata:Q42 - ❌ unmapped prefix + DOID:00000 + 🔴 valid prefix, but invalid identifier + + + NNN:00000 + 🔴 invalid prefix
-{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/src/pyobo/identifier_utils.py b/src/pyobo/identifier_utils.py index 270148fe..046765e4 100644 --- a/src/pyobo/identifier_utils.py +++ b/src/pyobo/identifier_utils.py @@ -2,18 +2,23 @@ """Utilities for handling prefixes.""" +import logging +from collections import defaultdict +from typing import Optional, Tuple, Union + +from .registries import ( + PREFIX_TO_MIRIAM_PREFIX, REMAPPINGS_PREFIX, XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, + get_miriam, get_namespace_synonyms, +) + __all__ = [ 'normalize_curie', + 'get_identifiers_org_link', 'normalize_prefix', 'normalize_dashes', ] -from collections import defaultdict -from typing import Optional, Tuple, Union - -from .registries import ( - REMAPPINGS_PREFIX, XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_namespace_synonyms, -) +logger = logging.getLogger(__name__) def alternate_strip_prefix(s, prefix): @@ -26,6 +31,7 @@ def alternate_strip_prefix(s, prefix): SYNONYM_TO_KEY = get_namespace_synonyms() UNHANDLED_NAMESPACES = defaultdict(list) UBERON_UNHANDLED = defaultdict(list) +MIRIAM = get_miriam(mappify=True) def normalize_prefix(prefix: str, *, curie=None, xref=None) -> Optional[str]: @@ -79,6 +85,19 @@ def normalize_curie(node: str) -> Union[Tuple[str, str], Tuple[None, None]]: return norm_node_prefix, identifier +def get_identifiers_org_link(prefix: str, identifier: str) -> Optional[str]: + """Get the identifiers.org URL if possible.""" + miriam_prefix, namespace_in_lui = PREFIX_TO_MIRIAM_PREFIX.get(prefix, (None, None)) + if not miriam_prefix and prefix in MIRIAM: + miriam_prefix = prefix + namespace_in_lui = MIRIAM[prefix]['namespaceEmbeddedInLui'] + if not miriam_prefix: + return + if namespace_in_lui: + miriam_prefix = miriam_prefix.upper() + return f'https://identifiers.org/{miriam_prefix}:{identifier}' + + # See: https://en.wikipedia.org/wiki/Dash FIGURE_DASH = b'\xe2\x80\x92'.decode('utf-8') EN_DASH = b'\xe2\x80\x93'.decode('utf-8') diff --git a/src/pyobo/registries/__init__.py b/src/pyobo/registries/__init__.py index 38917100..905d4999 100644 --- a/src/pyobo/registries/__init__.py +++ b/src/pyobo/registries/__init__.py @@ -3,8 +3,8 @@ """Extract registry information.""" from .metaregistry import ( # noqa: F401 - CURATED_REGISTRY, CURATED_URLS, NOT_AVAILABLE_AS_OBO, OBSOLETE, REMAPPINGS_PREFIX, XREF_BLACKLIST, - XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_curated_registry, + CURATED_REGISTRY, CURATED_URLS, NOT_AVAILABLE_AS_OBO, OBSOLETE, PREFIX_TO_MIRIAM_PREFIX, REMAPPINGS_PREFIX, + XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_curated_registry, ) from .miriam import get_miriam # noqa: F401 from .obofoundry import get_obofoundry # noqa: F401 diff --git a/src/pyobo/registries/metaregistry.json b/src/pyobo/registries/metaregistry.json index 3d731c1b..d9415575 100644 --- a/src/pyobo/registries/metaregistry.json +++ b/src/pyobo/registries/metaregistry.json @@ -207,6 +207,7 @@ "doid": { "miriam": { "id": "00000233", + "namespaceEmbeddedInLui": true, "prefix": "doid" }, "obofoundry": { @@ -234,6 +235,11 @@ "not_available_as_obo": true }, "eccode": { + "miriam": { + "id": "00000004", + "namespaceEmbeddedInLui": false, + "prefix": "ec-code" + }, "synonyms": [ "EC", "EC-CODE", @@ -259,6 +265,7 @@ "download": "http://www.ebi.ac.uk/efo/efo.obo", "miriam": { "id": "00000391", + "namespaceEmbeddedInLui": false, "prefix": "efo" }, "obofoundry": { @@ -318,6 +325,11 @@ "pattern": "\\d{7}" }, "flybase": { + "miriam": { + "id": "00000030", + "namespaceEmbeddedInLui": false, + "prefix": "fb" + }, "synonyms": [ "FB", "FlyBase" @@ -347,6 +359,11 @@ }, "go": { "download": "http://purl.obolibrary.org/obo/go.obo", + "miriam": { + "id": "00000022", + "namespaceEmbeddedInLui": true, + "prefix": "go" + }, "synonyms": [ "gobp", "gocc", @@ -377,6 +394,7 @@ "download": "http://purl.obolibrary.org/obo/hp.obo", "miriam": { "id": "00000571", + "namespaceEmbeddedInLui": true, "prefix": "hp" }, "obofoundry": { @@ -392,6 +410,7 @@ "icd10": { "miriam": { "id": "00000009", + "namespaceEmbeddedInLui": false, "prefix": "icd" }, "synonyms": [ @@ -504,6 +523,7 @@ "mesh": { "miriam": { "id": "00000560", + "namespaceEmbeddedInLui": false, "prefix": "mesh" }, "synonyms": [ @@ -596,6 +616,7 @@ "download": "http://purl.obolibrary.org/obo/ncbitaxon.obo", "miriam": { "id": "00000006", + "namespaceEmbeddedInLui": false, "prefix": "taxonomy" }, "pattern": "\\d+", @@ -611,6 +632,7 @@ "ncit": { "miriam": { "id": "00000139", + "namespaceEmbeddedInLui": false, "prefix": "ncit" }, "synonyms": [ @@ -649,6 +671,7 @@ "omit": { "miriam": { "id": "00000605", + "namespaceEmbeddedInLui": false, "prefix": "omit" }, "synonyms": [ @@ -825,6 +848,7 @@ "snomedct": { "miriam": { "id": "00000269", + "namespaceEmbeddedInLui": false, "prefix": "snomedct" }, "synonyms": [ @@ -921,6 +945,7 @@ "umls": { "miriam": { "id": "00000233", + "namespaceEmbeddedInLui": false, "prefix": "umls" }, "synonyms": [ diff --git a/src/pyobo/registries/metaregistry.py b/src/pyobo/registries/metaregistry.py index a7464d23..31cb6cc9 100644 --- a/src/pyobo/registries/metaregistry.py +++ b/src/pyobo/registries/metaregistry.py @@ -4,6 +4,7 @@ import json import os +from typing import Mapping, Tuple __all__ = [ 'CURATED_REGISTRY_PATH', @@ -56,3 +57,9 @@ def get_curated_registry(): REMAPPINGS_FULL = CURATED_REGISTRY['remappings']['full'] #: Remappings for xrefs based on the prefix. Doesn't take into account the semicolon : REMAPPINGS_PREFIX = CURATED_REGISTRY['remappings']['prefix'] + +PREFIX_TO_MIRIAM_PREFIX: Mapping[str, Tuple[str, str]] = { + prefix: (entry['miriam']['prefix'], entry['miriam']['namespaceEmbeddedInLui']) + for prefix, entry in CURATED_REGISTRY_DATABASE.items() + if 'miriam' in entry +} diff --git a/src/pyobo/registries/utils.py b/src/pyobo/registries/utils.py index 8247bc67..2b927237 100644 --- a/src/pyobo/registries/utils.py +++ b/src/pyobo/registries/utils.py @@ -33,7 +33,10 @@ def ensure_registry( """Download the registry (works for MIRIAM and OLS) if it doesn't already exist.""" if not force_download and cache_path is not None and os.path.exists(cache_path): with open(cache_path) as file: - return json.load(file) + rv = json.load(file) + if mappify: + rv = list_to_map(rv, id_key) + return rv rv = _download_paginated(url, embedded_key=embedded_key) rv = sorted(rv, key=itemgetter(id_key)) diff --git a/src/pyobo/xrefdb/cli.py b/src/pyobo/xrefdb/cli.py index ce8b7a29..c10c0dce 100644 --- a/src/pyobo/xrefdb/cli.py +++ b/src/pyobo/xrefdb/cli.py @@ -11,12 +11,13 @@ from .xrefs_pipeline import Canonicalizer, _iter_ooh_na_na, get_xref_df, summarize_xref_df from ..cli_utils import verbose_option +from ..constants import PYOBO_HOME from ..identifier_utils import UNHANDLED_NAMESPACES directory_option = click.option( '-d', '--directory', type=click.Path(dir_okay=True, file_okay=False, exists=True), - default=os.getcwd(), + default=PYOBO_HOME, ) @@ -44,7 +45,7 @@ def _write_tsv(df: pd.DataFrame, name: str) -> None: # Export a summary dataframe summary_df = summarize_xref_df(xrefs_df) - _write_tsv(summary_df, 'inspector_javerts_xref_summary.tsv') + _write_tsv(summary_df, 'inspector_javerts_xrefs_summary.tsv') # Export the namespaces that haven't been handled yet unmapped_path = os.path.join(directory, 'inspector_javerts_unmapped_xrefs.tsv') @@ -61,15 +62,17 @@ def ooh_na_na(directory: str): """Make the prefix-identifier-name dump.""" c = Counter() - path = os.path.join(directory, 'ooh_na_na.tsv.gz') - with gzip.open(path, mode='wt') as gzipped_file: + db_path = os.path.join(directory, 'ooh_na_na.tsv.gz') + click.echo(f'Writing Ooh-Na-Na to {db_path}') + with gzip.open(db_path, mode='wt') as gzipped_file: print('prefix', 'identifier', 'name', sep='\t', file=gzipped_file) for prefix, identifier, name in _iter_ooh_na_na(): c[prefix] += 1 print(prefix, identifier, name, sep='\t', file=gzipped_file) - path = os.path.join(directory, 'summary.tsv') - with open(path, 'w') as file: + summary_path = os.path.join(directory, 'ooh_na_na_summary.tsv') + click.echo(f'Writing Ooh-Na-Na summary to {summary_path}') + with open(summary_path, 'w') as file: for k, v in c.most_common(): print(k, v, sep='\t', file=file) diff --git a/tests/test_get_miriam_url.py b/tests/test_get_miriam_url.py new file mode 100644 index 00000000..8df1c257 --- /dev/null +++ b/tests/test_get_miriam_url.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +"""Tests for identifiers.org URL generation.""" + +import logging +import unittest + +import requests + +from pyobo.identifier_utils import get_identifiers_org_link, normalize_prefix +from pyobo.registries.miriam import get_miriam + +logger = logging.getLogger(__name__) + +#: These resources don't seem to exist anymore +BLACKLIST = { + 'abs', + 'aftol.taxonomy', + 'agricola', + 'ecogene', + 'euclinicaltrials', + 'fsnp', + 'gold', + 'gold.genome', + 'gold.meta', +} + +#: These resources will need special rules for resolving +UNSOLVED = { + 'ark', + 'did', + 'gramene.growthstage', + 'gwascentral.phenotype', + # TODO +} + + +class TestMiriam(unittest.TestCase): + """Test generating identifiers.org links.""" + + def test_successful(self): + """Test CURIEs that should work.""" + curies = [ + ('go', '0006915'), # name in LUI + ('doid', '11337'), # name in LUI + ('mesh', 'C000100'), # namespace not in LUI + ] + + # curies = [] + # for entry in get_miriam(): + # prefix = entry['prefix'] + # if prefix <= 'gramene.growthstage': # TODO REMOVE THIS LINE + # continue # TODO REMOVE THIS LINE + # norm_prefix = normalize_prefix(prefix) + # self.assertIsNotNone(norm_prefix, msg=f'could not normalize MIRIAM prefix: {norm_prefix}') + # curies.append((prefix, norm_prefix, entry['sampleId'])) + + for prefix, identifier in curies: + if prefix in BLACKLIST or prefix in UNSOLVED: + continue + with self.subTest(prefix=prefix, msg=f'failed for MIRIAM prefix: {prefix}'): + url = get_identifiers_org_link(prefix, identifier) + self.assertIsNotNone(url, msg=f'metaregistry does not contain prefix {prefix}') + try: + res = requests.get(url) + except ( + requests.exceptions.SSLError, + requests.exceptions.ConnectionError, + ): + logger.warning(f'identifiers.org has a problem resolving prefix {prefix}') + continue + self.assertFalse( + res.text.startswith('INVALID'), + msg=f'invalid url for {prefix}: {url}\n\n{res.text}', + ) + + def test_unsuccessful(self): + """Test links that should fail.""" + curies = [ + ('nope_nope_nope', '0006915'), + ] + for prefix, identifier in curies: + with self.subTest(prefix=prefix): + url = get_identifiers_org_link(prefix, identifier) + self.assertIsNone(url)