From db05ef4c465732afbbbb89436d5f24e806d294c1 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 26 Sep 2023 14:50:08 +0200 Subject: [PATCH] Various cleanup for large scale database generation (#160) * Updates for semra build * Add high-level getter functions * Remove deprecated code * Fix bad line handling * Generalize exception handling * Add back missing exception * Update struct.py * miRBase is broken. * Add kwargs to from_obo_path * Enable strict passing to relation curie parsing * Update setup.cfg * Update relations.py --- setup.cfg | 2 ++ src/pyobo/__init__.py | 2 ++ src/pyobo/api/__init__.py | 2 ++ src/pyobo/api/hierarchy.py | 26 ++++++++++++++++++++ src/pyobo/api/names.py | 8 ++++--- src/pyobo/api/relations.py | 25 ++++++++++++++++++- src/pyobo/getters.py | 3 +-- src/pyobo/reader.py | 6 ++--- src/pyobo/sources/__init__.py | 9 +++---- src/pyobo/sources/drugbank.py | 14 +++++++---- src/pyobo/sources/icd_utils.py | 13 ++++++---- src/pyobo/sources/pid.py | 3 ++- src/pyobo/sources/rgd.py | 2 +- src/pyobo/struct/struct.py | 4 +++- src/pyobo/xrefdb/bengo.py | 44 ---------------------------------- 15 files changed, 92 insertions(+), 71 deletions(-) delete mode 100644 src/pyobo/xrefdb/bengo.py diff --git a/setup.cfg b/setup.cfg index 45672a0f..fd81ee9c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -173,6 +173,8 @@ ignore = # Pickle stuff S301 S403 + # too complicated + C901 exclude = .tox, .git, diff --git a/src/pyobo/__init__.py b/src/pyobo/__init__.py index 8b512bb6..da268380 100644 --- a/src/pyobo/__init__.py +++ b/src/pyobo/__init__.py @@ -5,6 +5,7 @@ from .api import ( # noqa: F401 get_alts_to_id, get_ancestors, + get_children, get_definition, get_descendants, get_filtered_properties_df, @@ -12,6 +13,7 @@ get_filtered_properties_multimapping, get_filtered_relations_df, get_filtered_xrefs, + get_graph, get_hierarchy, get_id_definition_mapping, get_id_multirelations_mapping, diff --git a/src/pyobo/api/__init__.py b/src/pyobo/api/__init__.py index 808e171a..a1c01828 100644 --- a/src/pyobo/api/__init__.py +++ b/src/pyobo/api/__init__.py @@ -10,6 +10,7 @@ ) from .hierarchy import ( # noqa: F401 get_ancestors, + get_children, get_descendants, get_hierarchy, get_subhierarchy, @@ -38,6 +39,7 @@ ) from .relations import ( # noqa: F401 get_filtered_relations_df, + get_graph, get_id_multirelations_mapping, get_relation, get_relation_mapping, diff --git a/src/pyobo/api/hierarchy.py b/src/pyobo/api/hierarchy.py index 538da615..62638dbe 100644 --- a/src/pyobo/api/hierarchy.py +++ b/src/pyobo/api/hierarchy.py @@ -21,6 +21,7 @@ "get_ancestors", "has_ancestor", "is_descendent", + "get_children", ] from ..struct.reference import Reference @@ -189,6 +190,31 @@ def get_descendants( return nx.ancestors(hierarchy, curie) # note this is backwards +@lru_cache() +def get_children( + prefix: str, + identifier: str, + include_part_of: bool = True, + include_has_member: bool = False, + use_tqdm: bool = False, + force: bool = False, + **kwargs, +) -> Optional[Set[str]]: + """Get all of the descendants (children) of the term as CURIEs.""" + hierarchy = get_hierarchy( + prefix=prefix, + include_has_member=include_has_member, + include_part_of=include_part_of, + use_tqdm=use_tqdm, + force=force, + **kwargs, + ) + curie = f"{prefix}:{identifier}" + if curie not in hierarchy: + return None + return set(hierarchy.predecessors(curie)) + + def has_ancestor(prefix, identifier, ancestor_prefix, ancestor_identifier) -> bool: """Check that the first identifier has the second as an ancestor. diff --git a/src/pyobo/api/names.py b/src/pyobo/api/names.py index dd4b93ae..29b9d90d 100644 --- a/src/pyobo/api/names.py +++ b/src/pyobo/api/names.py @@ -4,7 +4,6 @@ import logging import subprocess -import zipfile from functools import lru_cache from typing import Callable, List, Mapping, Optional, Set, TypeVar @@ -59,6 +58,9 @@ def _help_get( logger.warning("[%s] unable to look up results with %s", prefix, f) NO_BUILD_PREFIXES.add(prefix) return None + except ValueError: + logger.warning("[%s] unable to look up results with %s", prefix, f) + return None if not mapping: if prefix not in NO_BUILD_PREFIXES: @@ -137,8 +139,8 @@ def _get_id_name_mapping() -> Mapping[str, str]: try: return _get_id_name_mapping() - except (zipfile.BadZipFile, subprocess.CalledProcessError): - logger.exception("[%s v%s] could not load", prefix, version) + except (Exception, subprocess.CalledProcessError) as e: + logger.exception("[%s v%s] could not load: %s", prefix, version, e) return {} diff --git a/src/pyobo/api/relations.py b/src/pyobo/api/relations.py index 10cfaa5a..1932592d 100644 --- a/src/pyobo/api/relations.py +++ b/src/pyobo/api/relations.py @@ -7,6 +7,7 @@ from functools import lru_cache from typing import List, Mapping, Optional +import networkx as nx import pandas as pd from .utils import get_version @@ -25,6 +26,15 @@ from ..utils.cache import cached_df from ..utils.path import prefix_cache_join +__all__ = [ + "get_relations_df", + "get_filtered_relations_df", + "get_id_multirelations_mapping", + "get_relation_mapping", + "get_relation", + "get_graph", +] + # TODO get_relation, get_relations logger = logging.getLogger(__name__) @@ -71,7 +81,7 @@ def get_filtered_relations_df( force: bool = False, version: Optional[str] = None, ) -> pd.DataFrame: - """Get all of the given relation.""" + """Get all the given relation.""" relation_prefix, relation_identifier = relation = get_reference_tuple(relation) if version is None: version = get_version(prefix) @@ -173,3 +183,16 @@ def get_relation( force=force, ) return relation_mapping.get(source_identifier) + + +def get_graph(prefix: str, **kwargs) -> nx.DiGraph: + """Get the relation graph.""" + rv = nx.MultiDiGraph() + df = get_relations_df(prefix=prefix, **kwargs) + for source_id, relation_prefix, relation_id, target_ns, target_id in df.values: + rv.add_edge( + f"{prefix}:{source_id}", + f"{target_ns}:{target_id}", + key=f"{relation_prefix}:{relation_id}", + ) + return rv diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py index 5145e344..bbbb4b8b 100644 --- a/src/pyobo/getters.py +++ b/src/pyobo/getters.py @@ -26,6 +26,7 @@ ) import bioregistry +from bioontologies import robot from tqdm.auto import tqdm from .constants import DATABASE_DIRECTORY @@ -120,8 +121,6 @@ def get_ontology( elif ontology_format == "obo": pass # all gucci elif ontology_format == "owl": - from bioontologies import robot - _converted_obo_path = path.with_suffix(".obo") if prefix in REQUIRES_NO_ROBOT_CHECK: robot_check = False diff --git a/src/pyobo/reader.py b/src/pyobo/reader.py index 89dc4ad1..a38913cf 100644 --- a/src/pyobo/reader.py +++ b/src/pyobo/reader.py @@ -50,7 +50,7 @@ def from_obo_path( - path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True + path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True, **kwargs ) -> Obo: """Get the OBO graph from a path.""" import obonet @@ -72,7 +72,7 @@ def from_obo_path( _clean_graph_ontology(graph, prefix) # Convert to an Obo instance and return - return from_obonet(graph, strict=strict) + return from_obonet(graph, strict=strict, **kwargs) def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> "Obo": # noqa:C901 @@ -574,7 +574,7 @@ def iterate_node_relationships( if relation_curie in RELATION_REMAPPINGS: relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie] else: - relation_prefix, relation_identifier = normalize_curie(relation_curie) + relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict) if relation_prefix is not None and relation_identifier is not None: relation = Reference(prefix=relation_prefix, identifier=relation_identifier) elif prefix is not None: diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py index d5e840d6..9f7b937e 100644 --- a/src/pyobo/sources/__init__.py +++ b/src/pyobo/sources/__init__.py @@ -31,9 +31,6 @@ from .kegg import KEGGGeneGetter, KEGGGenomeGetter, KEGGPathwayGetter from .mesh import MeSHGetter from .mgi import MGIGetter -from .mirbase import MiRBaseGetter -from .mirbase_family import MiRBaseFamilyGetter -from .mirbase_mature import MiRBaseMatureGetter from .msigdb import MSigDBGetter from .ncbigene import NCBIGeneGetter from .npass import NPASSGetter @@ -86,9 +83,9 @@ "MGIGetter", "MSigDBGetter", "MeSHGetter", - "MiRBaseFamilyGetter", - "MiRBaseGetter", - "MiRBaseMatureGetter", + # "MiRBaseFamilyGetter", + # "MiRBaseGetter", + # "MiRBaseMatureGetter", "NCBIGeneGetter", "NPASSGetter", "PIDGetter", diff --git a/src/pyobo/sources/drugbank.py b/src/pyobo/sources/drugbank.py index 51a07b39..fe567aac 100644 --- a/src/pyobo/sources/drugbank.py +++ b/src/pyobo/sources/drugbank.py @@ -15,6 +15,7 @@ import pystow from tqdm.auto import tqdm +from ..getters import NoBuild from ..struct import Obo, Reference, Term from ..struct.typedef import has_salt from ..utils.cache import cached_pickle @@ -145,12 +146,15 @@ def get_xml_root(version: Optional[str] = None) -> ElementTree.Element: Takes between 35-60 seconds. """ from drugbank_downloader import parse_drugbank + from pystow.config_api import ConfigError - element = parse_drugbank( - version=version, - username=pystow.get_config("pyobo", "drugbank_username"), - password=pystow.get_config("pyobo", "drugbank_password"), - ) + try: + username = pystow.get_config("pyobo", "drugbank_username", raise_on_missing=True) + password = pystow.get_config("pyobo", "drugbank_password", raise_on_missing=True) + except ConfigError as e: + raise NoBuild from e + + element = parse_drugbank(version=version, username=username, password=password) return element.getroot() diff --git a/src/pyobo/sources/icd_utils.py b/src/pyobo/sources/icd_utils.py index 8553d14c..74d52dd5 100644 --- a/src/pyobo/sources/icd_utils.py +++ b/src/pyobo/sources/icd_utils.py @@ -17,13 +17,12 @@ import pystow import requests from cachier import cachier +from pystow.config_api import ConfigError from tqdm.auto import tqdm +from ..getters import NoBuild from ..struct import Term -ICD_CLIENT_ID = pystow.get_config("pyobo", "icd_client_id") -ICD_CLIENT_SECRET = pystow.get_config("pyobo", "icd_client_secret") - TOKEN_URL = "https://icdaccessmanagement.who.int/connect/token" # noqa:S105 ICD_BASE_URL = "https://id.who.int/icd" @@ -52,10 +51,16 @@ def get_child_identifiers(endpoint: str, res_json: Mapping[str, Any]) -> List[st @cachier(stale_after=datetime.timedelta(minutes=45)) def get_icd_api_headers() -> Mapping[str, str]: """Get the headers, and refresh every hour.""" + try: + icd_client_id = pystow.get_config("pyobo", "icd_client_id", raise_on_missing=True) + icd_client_secret = pystow.get_config("pyobo", "icd_client_secret", raise_on_missing=True) + except ConfigError as e: + raise NoBuild from e + grant_type = "client_credentials" body_params = {"grant_type": grant_type} tqdm.write("getting ICD API token") - res = requests.post(TOKEN_URL, data=body_params, auth=(ICD_CLIENT_ID, ICD_CLIENT_SECRET)) + res = requests.post(TOKEN_URL, data=body_params, auth=(icd_client_id, icd_client_secret)) res_json = res.json() access_type = res_json["token_type"] access_token = res_json["access_token"] diff --git a/src/pyobo/sources/pid.py b/src/pyobo/sources/pid.py index 84f74814..92494a65 100644 --- a/src/pyobo/sources/pid.py +++ b/src/pyobo/sources/pid.py @@ -7,7 +7,6 @@ from typing import Iterable, List, Mapping, Tuple import pandas as pd -from protmapper.uniprot_client import get_gene_name, get_hgnc_id from ..api import get_id_name_mapping from ..struct import Obo, Reference, Term @@ -55,6 +54,8 @@ def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[Tuple def iter_terms(force: bool = False) -> Iterable[Term]: """Iterate over NCI PID terms.""" + from protmapper.uniprot_client import get_gene_name, get_hgnc_id + hgnc_id_to_name = get_id_name_mapping("hgnc") hgnc_name_to_id = {v: k for k, v in hgnc_id_to_name.items()} diff --git a/src/pyobo/sources/rgd.py b/src/pyobo/sources/rgd.py index 5b81cc9f..ef81aba1 100644 --- a/src/pyobo/sources/rgd.py +++ b/src/pyobo/sources/rgd.py @@ -106,7 +106,7 @@ def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Te force=force, version=version, quoting=3, - error_bad_lines=False, + on_bad_lines="skip", ) for _, row in tqdm( df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index 54281831..510f28e0 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -363,8 +363,10 @@ def extend_relationship(self, typedef: TypeDef, references: Iterable[Reference]) raise ValueError("can not extend a collection that includes a null reference") self.relationships[typedef].extend(references) - def append_property(self, prop: str, value: str) -> None: + def append_property(self, prop: Union[str, TypeDef], value: str) -> None: """Append a property.""" + if isinstance(prop, TypeDef): + prop = prop.curie self.properties[prop].append(value) def _definition_fp(self) -> str: diff --git a/src/pyobo/xrefdb/bengo.py b/src/pyobo/xrefdb/bengo.py deleted file mode 100644 index aec7a002..00000000 --- a/src/pyobo/xrefdb/bengo.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Pipeline for building a large ontology graph.""" - -import logging - -import bioregistry -import networkx as nx -from tqdm.auto import tqdm - -from pyobo import get_hierarchy -from pyobo.getters import SKIP -from pyobo.resource_utils import ensure_inspector_javert_df - -logger = logging.getLogger(__name__) - - -def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph: - """Make a super graph containing is_a, part_of, and xref relationships.""" - rv = nx.DiGraph() - - df = ensure_inspector_javert_df() - for source_ns, source_id, target_ns, target_id, provenance in df.values: - rv.add_edge( - f"{source_ns}:{source_id}", - f"{target_ns}:{target_id}", - relation="xref", - provenance=provenance, - ) - - logger.info("getting hierarchies") - it = tqdm(sorted(bioregistry.read_registry()), desc="Entries", disable=not use_tqdm) - for prefix in it: - if bioregistry.is_deprecated(prefix) or prefix in SKIP: - continue - if use_tqdm: - it.set_postfix({"prefix": prefix}) - - hierarchy = get_hierarchy(prefix, include_has_member=True, include_part_of=True) - rv.add_edges_from(hierarchy.edges(data=True)) - - # TODO include translates_to, transcribes_to, and has_variant - - return rv