diff --git a/src/pyobo/sources/mirbase.py b/src/pyobo/sources/mirbase.py
index 52c43fec..45233568 100644
--- a/src/pyobo/sources/mirbase.py
+++ b/src/pyobo/sources/mirbase.py
@@ -50,7 +50,7 @@ def get_obo(force: bool = False) -> Obo:
 def get_terms(version: str, force: bool = False) -> List[Term]:
     """Parse miRNA data from filepath and convert it to dictionary."""
     url = f"{BASE_URL}/{version}/miRNA.dat.gz"
-    definitions_path = ensure_path(PREFIX, url=url, version=version, force=force)
+    definitions_path = ensure_path(PREFIX, url=url, version=version, force=force, verify=False)
 
     file_handle = (
         gzip.open(definitions_path, "rt")
@@ -63,13 +63,21 @@ def get_terms(version: str, force: bool = False) -> List[Term]:
 
 def _prepare_organisms(version: str, force: bool = False):
     url = f"{BASE_URL}/{version}/organisms.txt.gz"
-    df = ensure_df(PREFIX, url=url, sep="\t", dtype={"#NCBI-taxid": str}, version=version)
+    df = ensure_df(
+        PREFIX,
+        url=url,
+        sep="\t",
+        dtype={"#NCBI-taxid": str},
+        version=version,
+        verify=False,
+        force=force,
+    )
     return {division: (taxonomy_id, name) for _, division, name, _tree, taxonomy_id in df.values}
 
 
 def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, List[str]]:
     url = f"{BASE_URL}/{version}/aliases.txt.gz"
-    df = ensure_df(PREFIX, url=url, sep="\t", version=version)
+    df = ensure_df(PREFIX, url=url, sep="\t", version=version, verify=False, force=force)
     return {
         mirbase_id: [s.strip() for s in synonyms.split(";") if s and s.strip()]
         for mirbase_id, synonyms in df.values
diff --git a/src/pyobo/sources/pfam.py b/src/pyobo/sources/pfam.py
index 5ca2ba44..4df02096 100644
--- a/src/pyobo/sources/pfam.py
+++ b/src/pyobo/sources/pfam.py
@@ -35,6 +35,7 @@ def get_pfam_clan_df(version: str, force: bool = False) -> pd.DataFrame:
         version=version,
         dtype=str,
         force=force,
+        backend="urllib",
     )
 
 
diff --git a/src/pyobo/sources/rhea.py b/src/pyobo/sources/rhea.py
index f7c94993..d680d395 100644
--- a/src/pyobo/sources/rhea.py
+++ b/src/pyobo/sources/rhea.py
@@ -41,6 +41,22 @@ def get_obo(force: bool = False) -> Obo:
 
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Iterate over terms in Rhea."""
+    url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
+    graph = pystow.ensure_rdf(
+        "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
+    )
+    result = graph.query(
+        """
+        PREFIX rh: <http://rdf.rhea-db.org/>
+        SELECT ?reaction ?reactionId ?reactionLabel WHERE {
+            ?reaction rdfs:subClassOf rh:Reaction .
+            ?reaction rh:id ?reactionId .
+            ?reaction rdfs:label ?reactionLabel .
+        }
+        """
+    )
+    names = {str(identifier): name for _, identifier, name in result}
+
     terms = {}
 
     directions = ensure_df(
@@ -50,10 +66,12 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         force=force,
     )
     for master, lr, rl, bi in directions.values:
-        terms[master] = Term(reference=Reference(PREFIX, master))
-        terms[lr] = Term(reference=Reference(PREFIX, lr))
-        terms[rl] = Term(reference=Reference(PREFIX, rl))
-        terms[bi] = Term(reference=Reference(PREFIX, bi))
+        terms[master] = Term(
+            reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master))
+        )
+        terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr)))
+        terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl)))
+        terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi)))
 
         terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
         terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
@@ -97,33 +115,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 xref_id,
             )
             continue
-        terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))
+        terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
 
     # TODO are EC codes equivalent?
     # TODO uniprot enabled by (RO:0002333)
     # TODO names?
-
-    url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
-    graph = pystow.ensure_rdf(
-        "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
-    )
-    result = graph.query(
-        """
-        PREFIX rh: <http://rdf.rhea-db.org/>
-        SELECT ?reaction ?reactionId ?reactionLabel WHERE {
-            ?reaction rdfs:subClassOf rh:Reaction .
-            ?reaction rh:id ?reactionId .
-            ?reaction rdfs:label ?reactionLabel .
-        }
-        """
-    )
-    for _, identifier, name in result:
-        identifier = str(identifier)
-        if identifier not in terms:
-            logger.debug("isolated element in rdf: rhea:%s ! %s", identifier, name)
-            continue
-        terms[identifier].reference.name = name
-
     # TODO participants?
     yield from terms.values()
diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
index f73e5989..54281831 100644
--- a/src/pyobo/struct/struct.py
+++ b/src/pyobo/struct/struct.py
@@ -172,7 +172,7 @@ def _ensure_ref(reference: ReferenceHint) -> Reference:
             raise ValueError
         return _rv
     if isinstance(reference, tuple):
-        return Reference(*reference)
+        return Reference(prefix=reference[0], identifier=reference[1])
     if isinstance(reference, Reference):
         return reference
     raise TypeError
diff --git a/src/pyobo/utils/io.py b/src/pyobo/utils/io.py
index 414b5f4e..d9e0f1d5 100644
--- a/src/pyobo/utils/io.py
+++ b/src/pyobo/utils/io.py
@@ -13,6 +13,7 @@
 from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple, TypeVar, Union
 from xml.etree.ElementTree import Element
 
+import pandas as pd
 from lxml import etree
 from tqdm.auto import tqdm
 
@@ -108,7 +109,8 @@
 def multisetdict(pairs: Iterable[Tuple[X, Y]]) -> Dict[X, Set[Y]]:
     """Accumulate a multisetdict from a list of pairs."""
     rv = defaultdict(set)
     for key, value in pairs:
-        rv[key].add(value)
+        if pd.notna(value):
+            rv[key].add(value)
     return dict(rv)
 
 
diff --git a/src/pyobo/utils/path.py b/src/pyobo/utils/path.py
index b4f30515..b5e27971 100644
--- a/src/pyobo/utils/path.py
+++ b/src/pyobo/utils/path.py
@@ -4,9 +4,10 @@
 import logging
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Dict, Literal, Optional, Union
 
 import pandas as pd
+import requests_ftp
 from pystow.utils import download, name_from_url, read_tarfile_csv
 
 from .misc import cleanup_version
@@ -26,6 +27,8 @@
 VersionHint = Union[None, str, Callable[[], str]]
 
+requests_ftp.monkeypatch_session()
+
 
 def prefix_directory_join(
     prefix: str,
@@ -62,6 +65,8 @@ def ensure_path(
     name: Optional[str] = None,
     force: bool = False,
     error_on_missing: bool = False,
+    backend: Literal["requests", "urllib"] = "urllib",
+    verify: bool = True,
 ) -> str:
     """Download a file if it doesn't exist."""
     if name is None:
@@ -72,10 +77,19 @@ def ensure_path(
     if not path.exists() and error_on_missing:
         raise FileNotFoundError
 
+    kwargs: Dict[str, Any]
+    if verify:
+        kwargs = {"backend": backend}
+    else:
+        if backend != "requests":
+            logger.warning("using requests since verify=False")
+        kwargs = {"backend": "requests", "verify": False}
+
     download(
         url=url,
         path=path,
         force=force,
+        **kwargs,
     )
     return path.as_posix()
 
@@ -89,10 +103,21 @@ def ensure_df(
     force: bool = False,
     sep: str = "\t",
     dtype=str,
+    verify: bool = True,
+    backend: Literal["requests", "urllib"] = "urllib",
     **kwargs,
 ) -> pd.DataFrame:
     """Download a file and open as a dataframe."""
-    _path = ensure_path(prefix, *parts, url=url, version=version, name=name, force=force)
+    _path = ensure_path(
+        prefix,
+        *parts,
+        url=url,
+        version=version,
+        name=name,
+        force=force,
+        verify=verify,
+        backend=backend,
+    )
    return pd.read_csv(_path, sep=sep, dtype=dtype, **kwargs)
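
The new `verify` and `backend` keywords thread from `ensure_df` through `ensure_path` into `pystow.utils.download`. A minimal usage sketch; the URL and version are placeholders, only the keyword arguments come from this diff:

```python
from pyobo.utils.path import ensure_df

# verify=False forces the "requests" backend, since urllib exposes no way to
# skip TLS certificate verification; a warning is logged unless
# backend="requests" was already requested.
df = ensure_df(
    "mirbase",
    url="https://example.org/22.1/organisms.txt.gz",  # placeholder URL
    version="22.1",  # placeholder version
    sep="\t",
    verify=False,
)
```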
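The module-level `requests_ftp.monkeypatch_session()` call in `path.py` appears to be there so that downloads routed through the "requests" backend can also handle `ftp://` URLs such as the Rhea dump above. A sketch of what the monkeypatch enables, following requests_ftp's documented usage:

```python
import requests
import requests_ftp

# Plain requests has no FTP support; after the monkeypatch, requests.Session
# handles ftp:// URLs in addition to http(s)://.
requests_ftp.monkeypatch_session()

session = requests.Session()
response = session.get("ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz")
print(len(response.content))
```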
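The `multisetdict` guard keeps `NaN` cells from dataframes out of the accumulated sets. A small illustration with hypothetical pairs:

```python
from pyobo.utils.io import multisetdict

# A missing cell in a dataframe surfaces as float("nan"); the pd.notna()
# guard drops it instead of storing NaN as a set member.
pairs = [("rhea:10000", "ec:1.1.1.1"), ("rhea:10000", float("nan"))]
assert multisetdict(pairs) == {"rhea:10000": {"ec:1.1.1.1"}}
```

Note that a key whose values are all `NaN` is omitted from the result entirely, since `rv[key]` is only touched inside the guard.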