Skip to content

Commit

Permalink
Various cleanup for large scale database generation (#160)
Browse files Browse the repository at this point in the history
* Updates for semra build

* Add high-level getter functions

* Remove deprecated code

* Fix bad line handling

* Generalize exception handling

* Add back missing exception

* Update struct.py

* miRBase is broken.

* Add kwargs to from_obo_path

* Enable strict passing to relation curie parsing

* Update setup.cfg

* Update relations.py
  • Loading branch information
cthoyt authored Sep 26, 2023
1 parent 291aeb4 commit db05ef4
Show file tree
Hide file tree
Showing 15 changed files with 92 additions and 71 deletions.
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ ignore =
# Pickle stuff
S301
S403
# too complicated
C901
exclude =
.tox,
.git,
Expand Down
2 changes: 2 additions & 0 deletions src/pyobo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@
from .api import ( # noqa: F401
get_alts_to_id,
get_ancestors,
get_children,
get_definition,
get_descendants,
get_filtered_properties_df,
get_filtered_properties_mapping,
get_filtered_properties_multimapping,
get_filtered_relations_df,
get_filtered_xrefs,
get_graph,
get_hierarchy,
get_id_definition_mapping,
get_id_multirelations_mapping,
Expand Down
2 changes: 2 additions & 0 deletions src/pyobo/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
)
from .hierarchy import ( # noqa: F401
get_ancestors,
get_children,
get_descendants,
get_hierarchy,
get_subhierarchy,
Expand Down Expand Up @@ -38,6 +39,7 @@
)
from .relations import ( # noqa: F401
get_filtered_relations_df,
get_graph,
get_id_multirelations_mapping,
get_relation,
get_relation_mapping,
Expand Down
26 changes: 26 additions & 0 deletions src/pyobo/api/hierarchy.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"get_ancestors",
"has_ancestor",
"is_descendent",
"get_children",
]

from ..struct.reference import Reference
Expand Down Expand Up @@ -189,6 +190,31 @@ def get_descendants(
return nx.ancestors(hierarchy, curie) # note this is backwards


@lru_cache()
def get_children(
    prefix: str,
    identifier: str,
    include_part_of: bool = True,
    include_has_member: bool = False,
    use_tqdm: bool = False,
    force: bool = False,
    **kwargs,
) -> Optional[Set[str]]:
    """Get the direct children of the term as CURIEs.

    Only the immediate predecessors of the term in the hierarchy graph are returned,
    not the full set of transitive descendants.

    Results are memoized with :func:`functools.lru_cache`, so every argument
    (including the values in ``kwargs``) must be hashable.

    :param prefix: The prefix of the resource the term belongs to.
    :param identifier: The local identifier of the term within ``prefix``.
    :param include_part_of: Passed through to :func:`get_hierarchy`.
    :param include_has_member: Passed through to :func:`get_hierarchy`.
    :param use_tqdm: Passed through to :func:`get_hierarchy`.
    :param force: Passed through to :func:`get_hierarchy`.
    :param kwargs: Additional keyword arguments passed through to :func:`get_hierarchy`.
    :returns: The set of CURIEs that are direct predecessors of ``prefix:identifier``
        in the hierarchy graph, or ``None`` if that CURIE is not a node in the graph.
    """
    hierarchy = get_hierarchy(
        prefix=prefix,
        include_has_member=include_has_member,
        include_part_of=include_part_of,
        use_tqdm=use_tqdm,
        force=force,
        **kwargs,
    )
    curie = f"{prefix}:{identifier}"
    if curie in hierarchy:
        # NOTE(review): presumably edges point from child to parent, which is why
        # predecessors are the children - confirm against get_hierarchy.
        return set(hierarchy.predecessors(curie))
    # Unknown term: signal "not found" rather than returning an empty set
    return None


def has_ancestor(prefix, identifier, ancestor_prefix, ancestor_identifier) -> bool:
"""Check that the first identifier has the second as an ancestor.
Expand Down
8 changes: 5 additions & 3 deletions src/pyobo/api/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import logging
import subprocess
import zipfile
from functools import lru_cache
from typing import Callable, List, Mapping, Optional, Set, TypeVar

Expand Down Expand Up @@ -59,6 +58,9 @@ def _help_get(
logger.warning("[%s] unable to look up results with %s", prefix, f)
NO_BUILD_PREFIXES.add(prefix)
return None
except ValueError:
logger.warning("[%s] unable to look up results with %s", prefix, f)
return None

if not mapping:
if prefix not in NO_BUILD_PREFIXES:
Expand Down Expand Up @@ -137,8 +139,8 @@ def _get_id_name_mapping() -> Mapping[str, str]:

try:
return _get_id_name_mapping()
except (zipfile.BadZipFile, subprocess.CalledProcessError):
logger.exception("[%s v%s] could not load", prefix, version)
except (Exception, subprocess.CalledProcessError) as e:
logger.exception("[%s v%s] could not load: %s", prefix, version, e)
return {}


Expand Down
25 changes: 24 additions & 1 deletion src/pyobo/api/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from functools import lru_cache
from typing import List, Mapping, Optional

import networkx as nx
import pandas as pd

from .utils import get_version
Expand All @@ -25,6 +26,15 @@
from ..utils.cache import cached_df
from ..utils.path import prefix_cache_join

__all__ = [
"get_relations_df",
"get_filtered_relations_df",
"get_id_multirelations_mapping",
"get_relation_mapping",
"get_relation",
"get_graph",
]

# TODO get_relation, get_relations

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -71,7 +81,7 @@ def get_filtered_relations_df(
force: bool = False,
version: Optional[str] = None,
) -> pd.DataFrame:
"""Get all of the given relation."""
"""Get all the given relation."""
relation_prefix, relation_identifier = relation = get_reference_tuple(relation)
if version is None:
version = get_version(prefix)
Expand Down Expand Up @@ -173,3 +183,16 @@ def get_relation(
force=force,
)
return relation_mapping.get(source_identifier)


def get_graph(prefix: str, **kwargs) -> nx.MultiDiGraph:
    """Get the relation graph for the given resource.

    :param prefix: The prefix of the resource whose relations are loaded. It is also
        used as the prefix for the CURIE of every source node.
    :param kwargs: Keyword arguments passed through to :func:`get_relations_df`.
    :returns: A directed multigraph whose nodes are CURIE strings. Each row of the
        relations dataframe is added as an edge from the source CURIE to the target
        CURIE, keyed by the CURIE of the relation.
    """
    rv = nx.MultiDiGraph()
    df = get_relations_df(prefix=prefix, **kwargs)
    # NOTE(review): assumes the dataframe has exactly five columns in this order
    # (source id, relation prefix, relation id, target prefix, target id) - confirm
    # against get_relations_df.
    for source_id, relation_prefix, relation_id, target_ns, target_id in df.values:
        rv.add_edge(
            f"{prefix}:{source_id}",
            f"{target_ns}:{target_id}",
            key=f"{relation_prefix}:{relation_id}",
        )
    return rv
3 changes: 1 addition & 2 deletions src/pyobo/getters.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
)

import bioregistry
from bioontologies import robot
from tqdm.auto import tqdm

from .constants import DATABASE_DIRECTORY
Expand Down Expand Up @@ -120,8 +121,6 @@ def get_ontology(
elif ontology_format == "obo":
pass # all gucci
elif ontology_format == "owl":
from bioontologies import robot

_converted_obo_path = path.with_suffix(".obo")
if prefix in REQUIRES_NO_ROBOT_CHECK:
robot_check = False
Expand Down
6 changes: 3 additions & 3 deletions src/pyobo/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@


def from_obo_path(
path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True
path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True, **kwargs
) -> Obo:
"""Get the OBO graph from a path."""
import obonet
Expand All @@ -72,7 +72,7 @@ def from_obo_path(
_clean_graph_ontology(graph, prefix)

# Convert to an Obo instance and return
return from_obonet(graph, strict=strict)
return from_obonet(graph, strict=strict, **kwargs)


def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> "Obo": # noqa:C901
Expand Down Expand Up @@ -574,7 +574,7 @@ def iterate_node_relationships(
if relation_curie in RELATION_REMAPPINGS:
relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
else:
relation_prefix, relation_identifier = normalize_curie(relation_curie)
relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
if relation_prefix is not None and relation_identifier is not None:
relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
elif prefix is not None:
Expand Down
9 changes: 3 additions & 6 deletions src/pyobo/sources/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@
from .kegg import KEGGGeneGetter, KEGGGenomeGetter, KEGGPathwayGetter
from .mesh import MeSHGetter
from .mgi import MGIGetter
from .mirbase import MiRBaseGetter
from .mirbase_family import MiRBaseFamilyGetter
from .mirbase_mature import MiRBaseMatureGetter
from .msigdb import MSigDBGetter
from .ncbigene import NCBIGeneGetter
from .npass import NPASSGetter
Expand Down Expand Up @@ -86,9 +83,9 @@
"MGIGetter",
"MSigDBGetter",
"MeSHGetter",
"MiRBaseFamilyGetter",
"MiRBaseGetter",
"MiRBaseMatureGetter",
# "MiRBaseFamilyGetter",
# "MiRBaseGetter",
# "MiRBaseMatureGetter",
"NCBIGeneGetter",
"NPASSGetter",
"PIDGetter",
Expand Down
14 changes: 9 additions & 5 deletions src/pyobo/sources/drugbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import pystow
from tqdm.auto import tqdm

from ..getters import NoBuild
from ..struct import Obo, Reference, Term
from ..struct.typedef import has_salt
from ..utils.cache import cached_pickle
Expand Down Expand Up @@ -145,12 +146,15 @@ def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
Takes between 35-60 seconds.
"""
from drugbank_downloader import parse_drugbank
from pystow.config_api import ConfigError

element = parse_drugbank(
version=version,
username=pystow.get_config("pyobo", "drugbank_username"),
password=pystow.get_config("pyobo", "drugbank_password"),
)
try:
username = pystow.get_config("pyobo", "drugbank_username", raise_on_missing=True)
password = pystow.get_config("pyobo", "drugbank_password", raise_on_missing=True)
except ConfigError as e:
raise NoBuild from e

element = parse_drugbank(version=version, username=username, password=password)
return element.getroot()


Expand Down
13 changes: 9 additions & 4 deletions src/pyobo/sources/icd_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@
import pystow
import requests
from cachier import cachier
from pystow.config_api import ConfigError
from tqdm.auto import tqdm

from ..getters import NoBuild
from ..struct import Term

ICD_CLIENT_ID = pystow.get_config("pyobo", "icd_client_id")
ICD_CLIENT_SECRET = pystow.get_config("pyobo", "icd_client_secret")

TOKEN_URL = "https://icdaccessmanagement.who.int/connect/token" # noqa:S105

ICD_BASE_URL = "https://id.who.int/icd"
Expand Down Expand Up @@ -52,10 +51,16 @@ def get_child_identifiers(endpoint: str, res_json: Mapping[str, Any]) -> List[st
@cachier(stale_after=datetime.timedelta(minutes=45))
def get_icd_api_headers() -> Mapping[str, str]:
"""Get the headers, and refresh every hour."""
try:
icd_client_id = pystow.get_config("pyobo", "icd_client_id", raise_on_missing=True)
icd_client_secret = pystow.get_config("pyobo", "icd_client_secret", raise_on_missing=True)
except ConfigError as e:
raise NoBuild from e

grant_type = "client_credentials"
body_params = {"grant_type": grant_type}
tqdm.write("getting ICD API token")
res = requests.post(TOKEN_URL, data=body_params, auth=(ICD_CLIENT_ID, ICD_CLIENT_SECRET))
res = requests.post(TOKEN_URL, data=body_params, auth=(icd_client_id, icd_client_secret))
res_json = res.json()
access_type = res_json["token_type"]
access_token = res_json["access_token"]
Expand Down
3 changes: 2 additions & 1 deletion src/pyobo/sources/pid.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from typing import Iterable, List, Mapping, Tuple

import pandas as pd
from protmapper.uniprot_client import get_gene_name, get_hgnc_id

from ..api import get_id_name_mapping
from ..struct import Obo, Reference, Term
Expand Down Expand Up @@ -55,6 +54,8 @@ def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[Tuple

def iter_terms(force: bool = False) -> Iterable[Term]:
"""Iterate over NCI PID terms."""
from protmapper.uniprot_client import get_gene_name, get_hgnc_id

hgnc_id_to_name = get_id_name_mapping("hgnc")
hgnc_name_to_id = {v: k for k, v in hgnc_id_to_name.items()}

Expand Down
2 changes: 1 addition & 1 deletion src/pyobo/sources/rgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Te
force=force,
version=version,
quoting=3,
error_bad_lines=False,
on_bad_lines="skip",
)
for _, row in tqdm(
df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True
Expand Down
4 changes: 3 additions & 1 deletion src/pyobo/struct/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,10 @@ def extend_relationship(self, typedef: TypeDef, references: Iterable[Reference])
raise ValueError("can not extend a collection that includes a null reference")
self.relationships[typedef].extend(references)

def append_property(self, prop: str, value: str) -> None:
def append_property(self, prop: Union[str, TypeDef], value: str) -> None:
"""Append a property."""
if isinstance(prop, TypeDef):
prop = prop.curie
self.properties[prop].append(value)

def _definition_fp(self) -> str:
Expand Down
44 changes: 0 additions & 44 deletions src/pyobo/xrefdb/bengo.py

This file was deleted.

0 comments on commit db05ef4

Please sign in to comment.