From db05ef4c465732afbbbb89436d5f24e806d294c1 Mon Sep 17 00:00:00 2001
From: Charles Tapley Hoyt <cthoyt@gmail.com>
Date: Tue, 26 Sep 2023 14:50:08 +0200
Subject: [PATCH] Various cleanup for large-scale database generation (#160)

* Updates for semra build

* Add high-level getter functions

* Remove deprecated code

* Fix bad line handling

* Generalize exception handling

* Add back missing exception

* Update struct.py

* Disable the miRBase sources because miRBase is broken

* Add kwargs to from_obo_path

* Enable strict passing to relation curie parsing

* Update setup.cfg

* Update relations.py
---
 setup.cfg                      |  2 ++
 src/pyobo/__init__.py          |  2 ++
 src/pyobo/api/__init__.py      |  2 ++
 src/pyobo/api/hierarchy.py     | 26 ++++++++++++++++++++
 src/pyobo/api/names.py         |  8 ++++---
 src/pyobo/api/relations.py     | 25 ++++++++++++++++++-
 src/pyobo/getters.py           |  3 +--
 src/pyobo/reader.py            |  6 ++---
 src/pyobo/sources/__init__.py  |  9 +++----
 src/pyobo/sources/drugbank.py  | 14 +++++++----
 src/pyobo/sources/icd_utils.py | 13 ++++++----
 src/pyobo/sources/pid.py       |  3 ++-
 src/pyobo/sources/rgd.py       |  2 +-
 src/pyobo/struct/struct.py     |  4 +++-
 src/pyobo/xrefdb/bengo.py      | 44 ----------------------------------
 15 files changed, 92 insertions(+), 71 deletions(-)
 delete mode 100644 src/pyobo/xrefdb/bengo.py

diff --git a/setup.cfg b/setup.cfg
index 45672a0f..fd81ee9c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -173,6 +173,8 @@ ignore =
 # Pickle stuff
     S301
     S403
+    # function is too complex (McCabe complexity check)
+    C901
 exclude =
     .tox,
     .git,
diff --git a/src/pyobo/__init__.py b/src/pyobo/__init__.py
index 8b512bb6..da268380 100644
--- a/src/pyobo/__init__.py
+++ b/src/pyobo/__init__.py
@@ -5,6 +5,7 @@
 from .api import (  # noqa: F401
     get_alts_to_id,
     get_ancestors,
+    get_children,
     get_definition,
     get_descendants,
     get_filtered_properties_df,
@@ -12,6 +13,7 @@
     get_filtered_properties_multimapping,
     get_filtered_relations_df,
     get_filtered_xrefs,
+    get_graph,
     get_hierarchy,
     get_id_definition_mapping,
     get_id_multirelations_mapping,
diff --git a/src/pyobo/api/__init__.py b/src/pyobo/api/__init__.py
index 808e171a..a1c01828 100644
--- a/src/pyobo/api/__init__.py
+++ b/src/pyobo/api/__init__.py
@@ -10,6 +10,7 @@
 )
 from .hierarchy import (  # noqa: F401
     get_ancestors,
+    get_children,
     get_descendants,
     get_hierarchy,
     get_subhierarchy,
@@ -38,6 +39,7 @@
 )
 from .relations import (  # noqa: F401
     get_filtered_relations_df,
+    get_graph,
     get_id_multirelations_mapping,
     get_relation,
     get_relation_mapping,
diff --git a/src/pyobo/api/hierarchy.py b/src/pyobo/api/hierarchy.py
index 538da615..62638dbe 100644
--- a/src/pyobo/api/hierarchy.py
+++ b/src/pyobo/api/hierarchy.py
@@ -21,6 +21,7 @@
     "get_ancestors",
     "has_ancestor",
     "is_descendent",
+    "get_children",
 ]
 
 from ..struct.reference import Reference
@@ -189,6 +190,31 @@ def get_descendants(
     return nx.ancestors(hierarchy, curie)  # note this is backwards
 
 
+@lru_cache()
+def get_children(
+    prefix: str,
+    identifier: str,
+    include_part_of: bool = True,
+    include_has_member: bool = False,
+    use_tqdm: bool = False,
+    force: bool = False,
+    **kwargs,
+) -> Optional[Set[str]]:
+    """Get the direct children of the term as CURIEs, or None if the term is absent."""
+    hierarchy = get_hierarchy(
+        prefix=prefix,
+        include_has_member=include_has_member,
+        include_part_of=include_part_of,
+        use_tqdm=use_tqdm,
+        force=force,
+        **kwargs,
+    )
+    curie = f"{prefix}:{identifier}"
+    if curie not in hierarchy:
+        return None
+    return set(hierarchy.predecessors(curie))  # hierarchy edges point child -> parent
+
+
 def has_ancestor(prefix, identifier, ancestor_prefix, ancestor_identifier) -> bool:
     """Check that the first identifier has the second as an ancestor.
 
diff --git a/src/pyobo/api/names.py b/src/pyobo/api/names.py
index dd4b93ae..29b9d90d 100644
--- a/src/pyobo/api/names.py
+++ b/src/pyobo/api/names.py
@@ -4,7 +4,6 @@
 
 import logging
 import subprocess
-import zipfile
 from functools import lru_cache
 from typing import Callable, List, Mapping, Optional, Set, TypeVar
 
@@ -59,6 +58,9 @@ def _help_get(
             logger.warning("[%s] unable to look up results with %s", prefix, f)
             NO_BUILD_PREFIXES.add(prefix)
         return None
+    except ValueError:
+        logger.warning("[%s] unable to look up results with %s", prefix, f)
+        return None
 
     if not mapping:
         if prefix not in NO_BUILD_PREFIXES:
@@ -137,8 +139,8 @@ def _get_id_name_mapping() -> Mapping[str, str]:
 
     try:
         return _get_id_name_mapping()
-    except (zipfile.BadZipFile, subprocess.CalledProcessError):
-        logger.exception("[%s v%s] could not load", prefix, version)
+    except (Exception, subprocess.CalledProcessError) as e:
+        logger.exception("[%s v%s] could not load: %s", prefix, version, e)
         return {}
 
 
diff --git a/src/pyobo/api/relations.py b/src/pyobo/api/relations.py
index 10cfaa5a..1932592d 100644
--- a/src/pyobo/api/relations.py
+++ b/src/pyobo/api/relations.py
@@ -7,6 +7,7 @@
 from functools import lru_cache
 from typing import List, Mapping, Optional
 
+import networkx as nx
 import pandas as pd
 
 from .utils import get_version
@@ -25,6 +26,15 @@
 from ..utils.cache import cached_df
 from ..utils.path import prefix_cache_join
 
+__all__ = [
+    "get_relations_df",
+    "get_filtered_relations_df",
+    "get_id_multirelations_mapping",
+    "get_relation_mapping",
+    "get_relation",
+    "get_graph",
+]
+
 # TODO get_relation, get_relations
 
 logger = logging.getLogger(__name__)
@@ -71,7 +81,7 @@ def get_filtered_relations_df(
     force: bool = False,
     version: Optional[str] = None,
 ) -> pd.DataFrame:
-    """Get all of the given relation."""
+    """Get all instances of the given relation."""
     relation_prefix, relation_identifier = relation = get_reference_tuple(relation)
     if version is None:
         version = get_version(prefix)
@@ -173,3 +183,16 @@ def get_relation(
         force=force,
     )
     return relation_mapping.get(source_identifier)
+
+
+def get_graph(prefix: str, **kwargs) -> nx.DiGraph:
+    """Get the relations as a multi-digraph whose edge keys are relation CURIEs."""
+    rv = nx.MultiDiGraph()
+    df = get_relations_df(prefix=prefix, **kwargs)
+    for source_id, relation_prefix, relation_id, target_ns, target_id in df.values:
+        rv.add_edge(
+            f"{prefix}:{source_id}",
+            f"{target_ns}:{target_id}",
+            key=f"{relation_prefix}:{relation_id}",
+        )
+    return rv
diff --git a/src/pyobo/getters.py b/src/pyobo/getters.py
index 5145e344..bbbb4b8b 100644
--- a/src/pyobo/getters.py
+++ b/src/pyobo/getters.py
@@ -26,6 +26,7 @@
 )
 
 import bioregistry
+from bioontologies import robot
 from tqdm.auto import tqdm
 
 from .constants import DATABASE_DIRECTORY
@@ -120,8 +121,6 @@ def get_ontology(
     elif ontology_format == "obo":
         pass  # all gucci
     elif ontology_format == "owl":
-        from bioontologies import robot
-
         _converted_obo_path = path.with_suffix(".obo")
         if prefix in REQUIRES_NO_ROBOT_CHECK:
             robot_check = False
diff --git a/src/pyobo/reader.py b/src/pyobo/reader.py
index 89dc4ad1..a38913cf 100644
--- a/src/pyobo/reader.py
+++ b/src/pyobo/reader.py
@@ -50,7 +50,7 @@
 
 
 def from_obo_path(
-    path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True
+    path: Union[str, Path], prefix: Optional[str] = None, *, strict: bool = True, **kwargs
 ) -> Obo:
     """Get the OBO graph from a path."""
     import obonet
@@ -72,7 +72,7 @@ def from_obo_path(
         _clean_graph_ontology(graph, prefix)
 
     # Convert to an Obo instance and return
-    return from_obonet(graph, strict=strict)
+    return from_obonet(graph, strict=strict, **kwargs)
 
 
 def from_obonet(graph: nx.MultiDiGraph, *, strict: bool = True) -> "Obo":  # noqa:C901
@@ -574,7 +574,7 @@ def iterate_node_relationships(
         if relation_curie in RELATION_REMAPPINGS:
             relation_prefix, relation_identifier = RELATION_REMAPPINGS[relation_curie]
         else:
-            relation_prefix, relation_identifier = normalize_curie(relation_curie)
+            relation_prefix, relation_identifier = normalize_curie(relation_curie, strict=strict)
         if relation_prefix is not None and relation_identifier is not None:
             relation = Reference(prefix=relation_prefix, identifier=relation_identifier)
         elif prefix is not None:
diff --git a/src/pyobo/sources/__init__.py b/src/pyobo/sources/__init__.py
index d5e840d6..9f7b937e 100644
--- a/src/pyobo/sources/__init__.py
+++ b/src/pyobo/sources/__init__.py
@@ -31,9 +31,6 @@
 from .kegg import KEGGGeneGetter, KEGGGenomeGetter, KEGGPathwayGetter
 from .mesh import MeSHGetter
 from .mgi import MGIGetter
-from .mirbase import MiRBaseGetter
-from .mirbase_family import MiRBaseFamilyGetter
-from .mirbase_mature import MiRBaseMatureGetter
 from .msigdb import MSigDBGetter
 from .ncbigene import NCBIGeneGetter
 from .npass import NPASSGetter
@@ -86,9 +83,9 @@
     "MGIGetter",
     "MSigDBGetter",
     "MeSHGetter",
-    "MiRBaseFamilyGetter",
-    "MiRBaseGetter",
-    "MiRBaseMatureGetter",
+    # "MiRBaseFamilyGetter",
+    # "MiRBaseGetter",
+    # "MiRBaseMatureGetter",
     "NCBIGeneGetter",
     "NPASSGetter",
     "PIDGetter",
diff --git a/src/pyobo/sources/drugbank.py b/src/pyobo/sources/drugbank.py
index 51a07b39..fe567aac 100644
--- a/src/pyobo/sources/drugbank.py
+++ b/src/pyobo/sources/drugbank.py
@@ -15,6 +15,7 @@
 import pystow
 from tqdm.auto import tqdm
 
+from ..getters import NoBuild
 from ..struct import Obo, Reference, Term
 from ..struct.typedef import has_salt
 from ..utils.cache import cached_pickle
@@ -145,12 +146,15 @@ def get_xml_root(version: Optional[str] = None) -> ElementTree.Element:
     Takes between 35-60 seconds.
     """
     from drugbank_downloader import parse_drugbank
+    from pystow.config_api import ConfigError
 
-    element = parse_drugbank(
-        version=version,
-        username=pystow.get_config("pyobo", "drugbank_username"),
-        password=pystow.get_config("pyobo", "drugbank_password"),
-    )
+    try:
+        username = pystow.get_config("pyobo", "drugbank_username", raise_on_missing=True)
+        password = pystow.get_config("pyobo", "drugbank_password", raise_on_missing=True)
+    except ConfigError as e:
+        raise NoBuild from e
+
+    element = parse_drugbank(version=version, username=username, password=password)
     return element.getroot()
 
 
diff --git a/src/pyobo/sources/icd_utils.py b/src/pyobo/sources/icd_utils.py
index 8553d14c..74d52dd5 100644
--- a/src/pyobo/sources/icd_utils.py
+++ b/src/pyobo/sources/icd_utils.py
@@ -17,13 +17,12 @@
 import pystow
 import requests
 from cachier import cachier
+from pystow.config_api import ConfigError
 from tqdm.auto import tqdm
 
+from ..getters import NoBuild
 from ..struct import Term
 
-ICD_CLIENT_ID = pystow.get_config("pyobo", "icd_client_id")
-ICD_CLIENT_SECRET = pystow.get_config("pyobo", "icd_client_secret")
-
 TOKEN_URL = "https://icdaccessmanagement.who.int/connect/token"  # noqa:S105
 
 ICD_BASE_URL = "https://id.who.int/icd"
@@ -52,10 +51,16 @@ def get_child_identifiers(endpoint: str, res_json: Mapping[str, Any]) -> List[st
 @cachier(stale_after=datetime.timedelta(minutes=45))
 def get_icd_api_headers() -> Mapping[str, str]:
     """Get the headers, and refresh every hour."""
+    try:
+        icd_client_id = pystow.get_config("pyobo", "icd_client_id", raise_on_missing=True)
+        icd_client_secret = pystow.get_config("pyobo", "icd_client_secret", raise_on_missing=True)
+    except ConfigError as e:
+        raise NoBuild from e
+
     grant_type = "client_credentials"
     body_params = {"grant_type": grant_type}
     tqdm.write("getting ICD API token")
-    res = requests.post(TOKEN_URL, data=body_params, auth=(ICD_CLIENT_ID, ICD_CLIENT_SECRET))
+    res = requests.post(TOKEN_URL, data=body_params, auth=(icd_client_id, icd_client_secret))
     res_json = res.json()
     access_type = res_json["token_type"]
     access_token = res_json["access_token"]
diff --git a/src/pyobo/sources/pid.py b/src/pyobo/sources/pid.py
index 84f74814..92494a65 100644
--- a/src/pyobo/sources/pid.py
+++ b/src/pyobo/sources/pid.py
@@ -7,7 +7,6 @@
 from typing import Iterable, List, Mapping, Tuple
 
 import pandas as pd
-from protmapper.uniprot_client import get_gene_name, get_hgnc_id
 
 from ..api import get_id_name_mapping
 from ..struct import Obo, Reference, Term
@@ -55,6 +54,8 @@ def iter_networks(use_tqdm: bool = False, force: bool = False) -> Iterable[Tuple
 
 def iter_terms(force: bool = False) -> Iterable[Term]:
     """Iterate over NCI PID terms."""
+    from protmapper.uniprot_client import get_gene_name, get_hgnc_id
+
     hgnc_id_to_name = get_id_name_mapping("hgnc")
     hgnc_name_to_id = {v: k for k, v in hgnc_id_to_name.items()}
 
diff --git a/src/pyobo/sources/rgd.py b/src/pyobo/sources/rgd.py
index 5b81cc9f..ef81aba1 100644
--- a/src/pyobo/sources/rgd.py
+++ b/src/pyobo/sources/rgd.py
@@ -106,7 +106,7 @@ def get_terms(force: bool = False, version: Optional[str] = None) -> Iterable[Te
         force=force,
         version=version,
         quoting=3,
-        error_bad_lines=False,
+        on_bad_lines="skip",
     )
     for _, row in tqdm(
         df.iterrows(), total=len(df.index), desc=f"Mapping {PREFIX}", unit_scale=True
diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
index 54281831..510f28e0 100644
--- a/src/pyobo/struct/struct.py
+++ b/src/pyobo/struct/struct.py
@@ -363,8 +363,10 @@ def extend_relationship(self, typedef: TypeDef, references: Iterable[Reference])
             raise ValueError("can not extend a collection that includes a null reference")
         self.relationships[typedef].extend(references)
 
-    def append_property(self, prop: str, value: str) -> None:
+    def append_property(self, prop: Union[str, TypeDef], value: str) -> None:
         """Append a property."""
+        if isinstance(prop, TypeDef):
+            prop = prop.curie
         self.properties[prop].append(value)
 
     def _definition_fp(self) -> str:
diff --git a/src/pyobo/xrefdb/bengo.py b/src/pyobo/xrefdb/bengo.py
deleted file mode 100644
index aec7a002..00000000
--- a/src/pyobo/xrefdb/bengo.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""Pipeline for building a large ontology graph."""
-
-import logging
-
-import bioregistry
-import networkx as nx
-from tqdm.auto import tqdm
-
-from pyobo import get_hierarchy
-from pyobo.getters import SKIP
-from pyobo.resource_utils import ensure_inspector_javert_df
-
-logger = logging.getLogger(__name__)
-
-
-def bens_magical_ontology(use_tqdm: bool = True) -> nx.DiGraph:
-    """Make a super graph containing is_a, part_of, and xref relationships."""
-    rv = nx.DiGraph()
-
-    df = ensure_inspector_javert_df()
-    for source_ns, source_id, target_ns, target_id, provenance in df.values:
-        rv.add_edge(
-            f"{source_ns}:{source_id}",
-            f"{target_ns}:{target_id}",
-            relation="xref",
-            provenance=provenance,
-        )
-
-    logger.info("getting hierarchies")
-    it = tqdm(sorted(bioregistry.read_registry()), desc="Entries", disable=not use_tqdm)
-    for prefix in it:
-        if bioregistry.is_deprecated(prefix) or prefix in SKIP:
-            continue
-        if use_tqdm:
-            it.set_postfix({"prefix": prefix})
-
-        hierarchy = get_hierarchy(prefix, include_has_member=True, include_part_of=True)
-        rv.add_edges_from(hierarchy.edges(data=True))
-
-    # TODO include translates_to, transcribes_to, and has_variant
-
-    return rv