Skip to content

Commit

Permalink
Add identifiers.org links when possible in resolver service (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt authored May 30, 2020
1 parent 16487e4 commit cc7b2f0
Show file tree
Hide file tree
Showing 9 changed files with 189 additions and 32 deletions.
16 changes: 13 additions & 3 deletions src/pyobo/apps/resolver/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from pyobo.apps.utils import gunicorn_option, host_option, port_option, run_app
from pyobo.cli_utils import verbose_option
from pyobo.constants import PYOBO_HOME
from pyobo.identifier_utils import normalize_curie
from pyobo.identifier_utils import get_identifiers_org_link, normalize_curie

resolve_blueprint = Blueprint('resolver', __name__)

Expand Down Expand Up @@ -93,15 +93,24 @@ def _help_resolve(curie: str) -> Mapping[str, Any]:
message='Could not identify prefix',
)

miriam = get_identifiers_org_link(prefix, identifier)

id_name_mapping = get_id_name_mapping(prefix)
if id_name_mapping is None:
return dict(
rv = dict(
query=curie,
prefix=prefix,
identifier=identifier,
success=False,
message='Could not find id->name mapping for prefix',
)
if miriam:
rv.update(dict(
miriam=miriam,
message='Could not find id->name mapping for prefix, but still able to report Identifiers.org link',
))
else:
rv['message'] = 'Could not find id->name mapping for prefix'
return rv

name = id_name_mapping.get(identifier)
if name is None:
Expand All @@ -119,6 +128,7 @@ def _help_resolve(curie: str) -> Mapping[str, Any]:
identifier=identifier,
name=name,
success=True,
miriam=miriam,
)


Expand Down
33 changes: 19 additions & 14 deletions src/pyobo/apps/resolver/templates/home.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ <h2>Ooh Na Na CURIE Resolver</h2>
<div class="panel-body">
<p>
This service resolves <a href="https://en.wikipedia.org/wiki/CURIE">CURIEs</a>
to their labels.
to their labels and builds Identifiers.org URLs when possible.
</p>
<p>
It has a single endpoint <code>/resolve/&lt;curie&gt;</code> that gives back JSON.
Expand All @@ -30,37 +30,42 @@ <h2>Ooh Na Na CURIE Resolver</h2>
</thead>
<tr>
<td><a href="{{ url_for('.resolve', curie='DOID:14330' ) }}">DOID:14330</a></td>
<td> exact</td>
<td>🟢 exact</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='doid:14330' ) }}">doid:14330</a></td>
<td> capitalization variant</td>
<td>🟢 capitalization variant</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='DO:14330' ) }}">DO:14330</a></td>
<td> synonym identified in metaregistry</td>
<td>🟢 synonym identified in metaregistry</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='DOID:00000' ) }}">DOID:00000</a></td>
<td>❌ invalid identifier</td>
<td><a href="{{ url_for('.resolve', curie='APO:0000155' ) }}">APO:0000155</a></td>
<td>🟡 able to look up name, but prefix is not listed on Identifiers.org</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='NNN:00000' ) }}">NNN:00000</a></td>
<td>❌ invalid prefix</td>
<td><a href="{{ url_for('.resolve', curie='wikidata:Q42' ) }}">wikidata:Q42</a></td>
<td>🟡 able to generate Identifiers.org link, but name unavailable</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='wikidata:Q42' ) }}">wikidata:Q42</a></td>
<td>❌ unmapped prefix</td>
<td><a href="{{ url_for('.resolve', curie='DOID:00000' ) }}">DOID:00000</a></td>
<td>🔴 valid prefix, but invalid identifier</td>
</tr>
<tr>
<td><a href="{{ url_for('.resolve', curie='NNN:00000' ) }}">NNN:00000</a></td>
<td>🔴 invalid prefix</td>
</tr>
</table>
<div class="panel-footer">
<p>
This service is implemented in <a href="https://github.com/pyobo/pyobo">PyOBO</a>.
If you want to know more about how it was built, check this
<a href="https://cthoyt.com/2020/04/18/ooh-na-na.html">blog post</a>.
This service is implemented in <a href="https://github.com/pyobo/pyobo">PyOBO</a> and graciously
hosted by <a href="https://envedatherapeutics.com/">Enveda Therapeutics</a>. If you want to know
more about how it was built, check this <a href="https://cthoyt.com/2020/04/18/ooh-na-na.html">blog
post</a>.
</p>
</div>
</div>
</div>
</div>
{% endblock %}
{% endblock %}
31 changes: 25 additions & 6 deletions src/pyobo/identifier_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,23 @@

"""Utilities for handling prefixes."""

import logging
from collections import defaultdict
from typing import Optional, Tuple, Union

from .registries import (
PREFIX_TO_MIRIAM_PREFIX, REMAPPINGS_PREFIX, XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST,
get_miriam, get_namespace_synonyms,
)

__all__ = [
'normalize_curie',
'get_identifiers_org_link',
'normalize_prefix',
'normalize_dashes',
]

from collections import defaultdict
from typing import Optional, Tuple, Union

from .registries import (
REMAPPINGS_PREFIX, XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_namespace_synonyms,
)
logger = logging.getLogger(__name__)


def alternate_strip_prefix(s, prefix):
Expand All @@ -26,6 +31,7 @@ def alternate_strip_prefix(s, prefix):
SYNONYM_TO_KEY = get_namespace_synonyms()
UNHANDLED_NAMESPACES = defaultdict(list)
UBERON_UNHANDLED = defaultdict(list)
MIRIAM = get_miriam(mappify=True)


def normalize_prefix(prefix: str, *, curie=None, xref=None) -> Optional[str]:
Expand Down Expand Up @@ -79,6 +85,19 @@ def normalize_curie(node: str) -> Union[Tuple[str, str], Tuple[None, None]]:
return norm_node_prefix, identifier


def get_identifiers_org_link(prefix: str, identifier: str) -> Optional[str]:
    """Build the Identifiers.org URL for a prefix/identifier pair, if possible.

    The ``PREFIX_TO_MIRIAM_PREFIX`` mapping is consulted first. If it gives no
    usable MIRIAM prefix, the ``MIRIAM`` registry is checked for the prefix itself.

    :param prefix: The prefix to look up
    :param identifier: The identifier to append after the MIRIAM prefix
    :return: The Identifiers.org URL, or None if no MIRIAM prefix could be found
    """
    miriam_prefix, embedded_in_lui = PREFIX_TO_MIRIAM_PREFIX.get(prefix, (None, None))

    if not miriam_prefix:
        # Fall back to the MIRIAM registry, keyed directly by the given prefix
        if prefix not in MIRIAM:
            return None
        miriam_prefix, embedded_in_lui = prefix, MIRIAM[prefix]['namespaceEmbeddedInLui']

    # Still nothing usable (e.g., an empty prefix that happens to be a MIRIAM key)
    if not miriam_prefix:
        return None

    # The prefix is upper-cased when the registry flags the namespace as embedded in the LUI
    if embedded_in_lui:
        miriam_prefix = miriam_prefix.upper()

    return f'https://identifiers.org/{miriam_prefix}:{identifier}'


# See: https://en.wikipedia.org/wiki/Dash
FIGURE_DASH = b'\xe2\x80\x92'.decode('utf-8')
EN_DASH = b'\xe2\x80\x93'.decode('utf-8')
Expand Down
4 changes: 2 additions & 2 deletions src/pyobo/registries/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"""Extract registry information."""

from .metaregistry import ( # noqa: F401
CURATED_REGISTRY, CURATED_URLS, NOT_AVAILABLE_AS_OBO, OBSOLETE, REMAPPINGS_PREFIX, XREF_BLACKLIST,
XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_curated_registry,
CURATED_REGISTRY, CURATED_URLS, NOT_AVAILABLE_AS_OBO, OBSOLETE, PREFIX_TO_MIRIAM_PREFIX, REMAPPINGS_PREFIX,
XREF_BLACKLIST, XREF_PREFIX_BLACKLIST, XREF_SUFFIX_BLACKLIST, get_curated_registry,
)
from .miriam import get_miriam # noqa: F401
from .obofoundry import get_obofoundry # noqa: F401
Expand Down
25 changes: 25 additions & 0 deletions src/pyobo/registries/metaregistry.json
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@
"doid": {
"miriam": {
"id": "00000233",
"namespaceEmbeddedInLui": true,
"prefix": "doid"
},
"obofoundry": {
Expand Down Expand Up @@ -234,6 +235,11 @@
"not_available_as_obo": true
},
"eccode": {
"miriam": {
"id": "00000004",
"namespaceEmbeddedInLui": false,
"prefix": "ec-code"
},
"synonyms": [
"EC",
"EC-CODE",
Expand All @@ -259,6 +265,7 @@
"download": "http://www.ebi.ac.uk/efo/efo.obo",
"miriam": {
"id": "00000391",
"namespaceEmbeddedInLui": false,
"prefix": "efo"
},
"obofoundry": {
Expand Down Expand Up @@ -318,6 +325,11 @@
"pattern": "\\d{7}"
},
"flybase": {
"miriam": {
"id": "00000030",
"namespaceEmbeddedInLui": false,
"prefix": "fb"
},
"synonyms": [
"FB",
"FlyBase"
Expand Down Expand Up @@ -347,6 +359,11 @@
},
"go": {
"download": "http://purl.obolibrary.org/obo/go.obo",
"miriam": {
"id": "00000022",
"namespaceEmbeddedInLui": true,
"prefix": "go"
},
"synonyms": [
"gobp",
"gocc",
Expand Down Expand Up @@ -377,6 +394,7 @@
"download": "http://purl.obolibrary.org/obo/hp.obo",
"miriam": {
"id": "00000571",
"namespaceEmbeddedInLui": true,
"prefix": "hp"
},
"obofoundry": {
Expand All @@ -392,6 +410,7 @@
"icd10": {
"miriam": {
"id": "00000009",
"namespaceEmbeddedInLui": false,
"prefix": "icd"
},
"synonyms": [
Expand Down Expand Up @@ -504,6 +523,7 @@
"mesh": {
"miriam": {
"id": "00000560",
"namespaceEmbeddedInLui": false,
"prefix": "mesh"
},
"synonyms": [
Expand Down Expand Up @@ -596,6 +616,7 @@
"download": "http://purl.obolibrary.org/obo/ncbitaxon.obo",
"miriam": {
"id": "00000006",
"namespaceEmbeddedInLui": false,
"prefix": "taxonomy"
},
"pattern": "\\d+",
Expand All @@ -611,6 +632,7 @@
"ncit": {
"miriam": {
"id": "00000139",
"namespaceEmbeddedInLui": false,
"prefix": "ncit"
},
"synonyms": [
Expand Down Expand Up @@ -649,6 +671,7 @@
"omit": {
"miriam": {
"id": "00000605",
"namespaceEmbeddedInLui": false,
"prefix": "omit"
},
"synonyms": [
Expand Down Expand Up @@ -825,6 +848,7 @@
"snomedct": {
"miriam": {
"id": "00000269",
"namespaceEmbeddedInLui": false,
"prefix": "snomedct"
},
"synonyms": [
Expand Down Expand Up @@ -921,6 +945,7 @@
"umls": {
"miriam": {
"id": "00000233",
"namespaceEmbeddedInLui": false,
"prefix": "umls"
},
"synonyms": [
Expand Down
7 changes: 7 additions & 0 deletions src/pyobo/registries/metaregistry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import json
import os
from typing import Mapping, Tuple

__all__ = [
'CURATED_REGISTRY_PATH',
Expand Down Expand Up @@ -56,3 +57,9 @@ def get_curated_registry():
REMAPPINGS_FULL = CURATED_REGISTRY['remappings']['full']
#: Remappings for xrefs based on the prefix. Doesn't take into account the colon (:)
REMAPPINGS_PREFIX = CURATED_REGISTRY['remappings']['prefix']

#: Mapping from each curated prefix that has a ``miriam`` entry to a pair of
#: its MIRIAM prefix and its ``namespaceEmbeddedInLui`` flag (a boolean in the JSON)
PREFIX_TO_MIRIAM_PREFIX: Mapping[str, Tuple[str, bool]] = {
    prefix: (entry['miriam']['prefix'], entry['miriam']['namespaceEmbeddedInLui'])
    for prefix, entry in CURATED_REGISTRY_DATABASE.items()
    if 'miriam' in entry
}
5 changes: 4 additions & 1 deletion src/pyobo/registries/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def ensure_registry(
"""Download the registry (works for MIRIAM and OLS) if it doesn't already exist."""
if not force_download and cache_path is not None and os.path.exists(cache_path):
with open(cache_path) as file:
return json.load(file)
rv = json.load(file)
if mappify:
rv = list_to_map(rv, id_key)
return rv

rv = _download_paginated(url, embedded_key=embedded_key)
rv = sorted(rv, key=itemgetter(id_key))
Expand Down
15 changes: 9 additions & 6 deletions src/pyobo/xrefdb/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@

from .xrefs_pipeline import Canonicalizer, _iter_ooh_na_na, get_xref_df, summarize_xref_df
from ..cli_utils import verbose_option
from ..constants import PYOBO_HOME
from ..identifier_utils import UNHANDLED_NAMESPACES

directory_option = click.option(
'-d', '--directory',
type=click.Path(dir_okay=True, file_okay=False, exists=True),
default=os.getcwd(),
default=PYOBO_HOME,
)


Expand Down Expand Up @@ -44,7 +45,7 @@ def _write_tsv(df: pd.DataFrame, name: str) -> None:

# Export a summary dataframe
summary_df = summarize_xref_df(xrefs_df)
_write_tsv(summary_df, 'inspector_javerts_xref_summary.tsv')
_write_tsv(summary_df, 'inspector_javerts_xrefs_summary.tsv')

# Export the namespaces that haven't been handled yet
unmapped_path = os.path.join(directory, 'inspector_javerts_unmapped_xrefs.tsv')
Expand All @@ -61,15 +62,17 @@ def ooh_na_na(directory: str):
"""Make the prefix-identifier-name dump."""
c = Counter()

path = os.path.join(directory, 'ooh_na_na.tsv.gz')
with gzip.open(path, mode='wt') as gzipped_file:
db_path = os.path.join(directory, 'ooh_na_na.tsv.gz')
click.echo(f'Writing Ooh-Na-Na to {db_path}')
with gzip.open(db_path, mode='wt') as gzipped_file:
print('prefix', 'identifier', 'name', sep='\t', file=gzipped_file)
for prefix, identifier, name in _iter_ooh_na_na():
c[prefix] += 1
print(prefix, identifier, name, sep='\t', file=gzipped_file)

path = os.path.join(directory, 'summary.tsv')
with open(path, 'w') as file:
summary_path = os.path.join(directory, 'ooh_na_na_summary.tsv')
click.echo(f'Writing Ooh-Na-Na summary to {summary_path}')
with open(summary_path, 'w') as file:
for k, v in c.most_common():
print(k, v, sep='\t', file=file)

Expand Down
Loading

0 comments on commit cc7b2f0

Please sign in to comment.