Fix pfam, rhea, and mirbase importers (#158)
* Fix pfam, rhea, and mirbase importers

* Clean
cthoyt authored Sep 9, 2023
1 parent 75f9ad7 commit 291aeb4
Showing 6 changed files with 66 additions and 34 deletions.
14 changes: 11 additions & 3 deletions src/pyobo/sources/mirbase.py
@@ -50,7 +50,7 @@ def get_obo(force: bool = False) -> Obo:
 def get_terms(version: str, force: bool = False) -> List[Term]:
     """Parse miRNA data from filepath and convert it to dictionary."""
     url = f"{BASE_URL}/{version}/miRNA.dat.gz"
-    definitions_path = ensure_path(PREFIX, url=url, version=version, force=force)
+    definitions_path = ensure_path(PREFIX, url=url, version=version, force=force, verify=False)
 
     file_handle = (
         gzip.open(definitions_path, "rt")
@@ -63,13 +63,21 @@ def get_terms(version: str, force: bool = False) -> List[Term]:
 
 def _prepare_organisms(version: str, force: bool = False):
     url = f"{BASE_URL}/{version}/organisms.txt.gz"
-    df = ensure_df(PREFIX, url=url, sep="\t", dtype={"#NCBI-taxid": str}, version=version)
+    df = ensure_df(
+        PREFIX,
+        url=url,
+        sep="\t",
+        dtype={"#NCBI-taxid": str},
+        version=version,
+        verify=False,
+        force=force,
+    )
     return {division: (taxonomy_id, name) for _, division, name, _tree, taxonomy_id in df.values}
 
 
 def _prepare_aliases(version: str, force: bool = False) -> Mapping[str, List[str]]:
     url = f"{BASE_URL}/{version}/aliases.txt.gz"
-    df = ensure_df(PREFIX, url=url, sep="\t", version=version)
+    df = ensure_df(PREFIX, url=url, sep="\t", version=version, verify=False, force=force)
     return {
         mirbase_id: [s.strip() for s in synonyms.split(";") if s and s.strip()]
         for mirbase_id, synonyms in df.values
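Note on verify=False: this flag threads through ensure_path/ensure_df to the download helper changed in src/pyobo/utils/path.py below, which then uses the requests backend with TLS certificate checks disabled. A minimal sketch of the resulting download behavior, assuming the miRBase host serves a certificate that fails validation; the URL and version here are hypothetical, not taken from this diff:

import requests
import urllib3

# Hypothetical miRBase file URL; the importer builds it from BASE_URL and the real version.
url = "https://mirbase.org/ftp/22.1/miRNA.dat.gz"
urllib3.disable_warnings()  # silence the InsecureRequestWarning this triggers
response = requests.get(url, verify=False)  # skip TLS certificate validation
response.raise_for_status()
with open("miRNA.dat.gz", "wb") as file:
    file.write(response.content)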
1 change: 1 addition & 0 deletions src/pyobo/sources/pfam.py
@@ -35,6 +35,7 @@ def get_pfam_clan_df(version: str, force: bool = False) -> pd.DataFrame:
         version=version,
         dtype=str,
         force=force,
+        backend="urllib",
     )
 
 
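Note on backend="urllib": ensure_df forwards this to pystow's download utility (see the src/pyobo/utils/path.py changes below), selecting urllib's downloader, which handles ftp:// URLs that plain requests rejects. A usage sketch; the Pfam clans URL here is an assumption for illustration:

from pystow.utils import download

download(
    url="ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz",  # assumed URL
    path="Pfam-A.clans.tsv.gz",
    backend="urllib",  # urllib supports ftp://; requests alone does not
)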
50 changes: 23 additions & 27 deletions src/pyobo/sources/rhea.py
@@ -41,6 +41,22 @@ def get_obo(force: bool = False) -> Obo:
 
 def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
     """Iterate over terms in Rhea."""
+    url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
+    graph = pystow.ensure_rdf(
+        "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
+    )
+    result = graph.query(
+        """
+        PREFIX rh:<http://rdf.rhea-db.org/>
+        SELECT ?reaction ?reactionId ?reactionLabel WHERE {
+            ?reaction rdfs:subClassOf rh:Reaction .
+            ?reaction rh:id ?reactionId .
+            ?reaction rdfs:label ?reactionLabel .
+        }
+        """
+    )
+    names = {str(identifier): name for _, identifier, name in result}
+
     terms = {}
 
     directions = ensure_df(
@@ -50,10 +66,12 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         force=force,
     )
     for master, lr, rl, bi in directions.values:
-        terms[master] = Term(reference=Reference(PREFIX, master))
-        terms[lr] = Term(reference=Reference(PREFIX, lr))
-        terms[rl] = Term(reference=Reference(PREFIX, rl))
-        terms[bi] = Term(reference=Reference(PREFIX, bi))
+        terms[master] = Term(
+            reference=Reference(prefix=PREFIX, identifier=master, name=names.get(master))
+        )
+        terms[lr] = Term(reference=Reference(prefix=PREFIX, identifier=lr, name=names.get(lr)))
+        terms[rl] = Term(reference=Reference(prefix=PREFIX, identifier=rl, name=names.get(rl)))
+        terms[bi] = Term(reference=Reference(prefix=PREFIX, identifier=bi, name=names.get(bi)))
 
         terms[master].append_relationship(has_left_to_right_reaction, terms[lr])
         terms[master].append_relationship(has_right_to_left_reaction, terms[rl])
@@ -97,33 +115,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
                 xref_id,
             )
             continue
-        terms[rhea_id].append_xref(Reference(xref_prefix, xref_id))
+        terms[rhea_id].append_xref(Reference(prefix=xref_prefix, identifier=xref_id))
 
     # TODO are EC codes equivalent?
     # TODO uniprot enabled by (RO:0002333)
     # TODO names?
-
-    url = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
-    graph = pystow.ensure_rdf(
-        "pyobo", "raw", PREFIX, version, url=url, force=force, parse_kwargs=dict(format="xml")
-    )
-    result = graph.query(
-        """
-        PREFIX rh:<http://rdf.rhea-db.org/>
-        SELECT ?reaction ?reactionId ?reactionLabel WHERE {
-            ?reaction rdfs:subClassOf rh:Reaction .
-            ?reaction rh:id ?reactionId .
-            ?reaction rdfs:label ?reactionLabel .
-        }
-        """
-    )
-    for _, identifier, name in result:
-        identifier = str(identifier)
-        if identifier not in terms:
-            logger.debug("isolated element in rdf: rhea:%s ! %s", identifier, name)
-            continue
-        terms[identifier].reference.name = name
-
     # TODO participants?
 
     yield from terms.values()
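The net effect of these changes is to run the SPARQL name lookup first, so each Reference is constructed with its name, rather than patching reference.name onto already-built terms afterwards. A self-contained sketch of the lookup step, run on a toy Turtle graph instead of the full Rhea RDF dump; the reaction identifier and label are made up for illustration:

import rdflib

# Build a tiny in-memory graph shaped like the Rhea RDF schema.
graph = rdflib.Graph()
graph.parse(
    data="""
        @prefix rh: <http://rdf.rhea-db.org/> .
        @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
        rh:10000 rdfs:subClassOf rh:Reaction ;
            rh:id "10000" ;
            rdfs:label "an example reaction label" .
    """,
    format="turtle",
)
# Same query as the importer; rdflib pre-binds the rdfs prefix.
result = graph.query(
    """
    PREFIX rh:<http://rdf.rhea-db.org/>
    SELECT ?reaction ?reactionId ?reactionLabel WHERE {
        ?reaction rdfs:subClassOf rh:Reaction .
        ?reaction rh:id ?reactionId .
        ?reaction rdfs:label ?reactionLabel .
    }
    """
)
names = {str(identifier): str(name) for _, identifier, name in result}
print(names)  # {'10000': 'an example reaction label'}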
2 changes: 1 addition & 1 deletion src/pyobo/struct/struct.py
@@ -172,7 +172,7 @@ def _ensure_ref(reference: ReferenceHint) -> Reference:
             raise ValueError
         return _rv
     if isinstance(reference, tuple):
-        return Reference(*reference)
+        return Reference(prefix=reference[0], identifier=reference[1])
     if isinstance(reference, Reference):
         return reference
     raise TypeError
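Switching to keyword arguments keeps the tuple branch of _ensure_ref working when Reference only accepts keyword construction, as pydantic models do. A sketch with a simplified stand-in for pyobo's Reference; the pydantic base is an assumption here, not stated in this diff:

from pydantic import BaseModel

class Reference(BaseModel):
    """Simplified stand-in for pyobo's Reference class."""

    prefix: str
    identifier: str

pair = ("rhea", "10000")
# Reference(*pair) would raise TypeError: pydantic models take no positional arguments.
reference = Reference(prefix=pair[0], identifier=pair[1])
print(reference)  # prefix='rhea' identifier='10000'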
4 changes: 3 additions & 1 deletion src/pyobo/utils/io.py
@@ -13,6 +13,7 @@
 from typing import Dict, Iterable, List, Mapping, Optional, Set, Tuple, TypeVar, Union
 from xml.etree.ElementTree import Element
 
+import pandas as pd
 from lxml import etree
 from tqdm.auto import tqdm
 
@@ -108,7 +109,8 @@ def multisetdict(pairs: Iterable[Tuple[X, Y]]) -> Dict[X, Set[Y]]:
     """Accumulate a multisetdict from a list of pairs."""
     rv = defaultdict(set)
     for key, value in pairs:
-        rv[key].add(value)
+        if pd.notna(value):
+            rv[key].add(value)
     return dict(rv)
 
 
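The pandas import supports the new guard in multisetdict: values from empty cells in a parsed TSV arrive as NaN and previously ended up inside the sets. A quick behavior check using the function as changed above (pd.notna is False for both NaN and None, so a key whose values are all missing drops out entirely):

from pyobo.utils.io import multisetdict

pairs = [("a", "x"), ("a", float("nan")), ("b", "y"), ("c", None)]
print(multisetdict(pairs))  # {'a': {'x'}, 'b': {'y'}} — note "c" is gone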
29 changes: 27 additions & 2 deletions src/pyobo/utils/path.py
@@ -4,9 +4,10 @@
 
 import logging
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Dict, Literal, Optional, Union
 
 import pandas as pd
+import requests_ftp
 from pystow.utils import download, name_from_url, read_tarfile_csv
 
 from .misc import cleanup_version
@@ -26,6 +27,8 @@
 
 VersionHint = Union[None, str, Callable[[], str]]
 
+requests_ftp.monkeypatch_session()
+
 
 def prefix_directory_join(
     prefix: str,
@@ -62,6 +65,8 @@ def ensure_path(
     name: Optional[str] = None,
     force: bool = False,
     error_on_missing: bool = False,
+    backend: Literal["requests", "urllib"] = "urllib",
+    verify: bool = True,
 ) -> str:
     """Download a file if it doesn't exist."""
     if name is None:
@@ -72,10 +77,19 @@ def ensure_path(
     if not path.exists() and error_on_missing:
         raise FileNotFoundError
 
+    kwargs: Dict[str, Any]
+    if verify:
+        kwargs = {"backend": backend}
+    else:
+        if backend != "requests":
+            logger.warning("using requests since verify=False")
+        kwargs = {"backend": "requests", "verify": False}
+
     download(
         url=url,
         path=path,
        force=force,
+        **kwargs,
     )
     return path.as_posix()
 
@@ -89,10 +103,21 @@ def ensure_df(
     force: bool = False,
     sep: str = "\t",
     dtype=str,
+    verify: bool = True,
+    backend: Literal["requests", "urllib"] = "urllib",
     **kwargs,
 ) -> pd.DataFrame:
     """Download a file and open as a dataframe."""
-    _path = ensure_path(prefix, *parts, url=url, version=version, name=name, force=force)
+    _path = ensure_path(
+        prefix,
+        *parts,
+        url=url,
+        version=version,
+        name=name,
+        force=force,
+        verify=verify,
+        backend=backend,
+    )
     return pd.read_csv(_path, sep=sep, dtype=dtype, **kwargs)
 
 
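The dispatch in ensure_path reads in isolation as: verify=True keeps the caller's backend choice, while verify=False forces the requests backend (the only one that accepts a verify flag) and warns when that overrides an explicit urllib request; the requests_ftp.monkeypatch_session() call above additionally teaches requests to fetch ftp:// URLs. A standalone sketch of the keyword selection, using a hypothetical helper name:

from typing import Any, Dict, Literal

def _download_kwargs(
    backend: Literal["requests", "urllib"], verify: bool
) -> Dict[str, Any]:
    """Mirror the keyword selection that ensure_path passes to pystow's download()."""
    if verify:
        return {"backend": backend}
    if backend != "requests":
        print("using requests since verify=False")  # stands in for logger.warning
    return {"backend": "requests", "verify": False}

print(_download_kwargs("urllib", True))   # {'backend': 'urllib'}
print(_download_kwargs("urllib", False))  # {'backend': 'requests', 'verify': False}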
