diff --git a/src/pyobo/api/alts.py b/src/pyobo/api/alts.py index 512e21e8..6122c7cf 100644 --- a/src/pyobo/api/alts.py +++ b/src/pyobo/api/alts.py @@ -28,12 +28,15 @@ @lru_cache() @wrap_norm_prefix -def get_id_to_alts(prefix: str, force: bool = False) -> Mapping[str, List[str]]: +def get_id_to_alts( + prefix: str, *, force: bool = False, version: Optional[str] = None +) -> Mapping[str, List[str]]: """Get alternate identifiers.""" if prefix in NO_ALTS: return {} - version = get_version(prefix) + if version is None: + version = get_version(prefix) path = prefix_cache_join(prefix, name="alt_ids.tsv", version=version) header = [f"{prefix}_id", "alt_id"] @@ -51,26 +54,28 @@ def _get_mapping() -> Mapping[str, List[str]]: @lru_cache() @wrap_norm_prefix -def get_alts_to_id(prefix: str, force: bool = False) -> Mapping[str, str]: +def get_alts_to_id( + prefix: str, *, force: bool = False, version: Optional[str] = None +) -> Mapping[str, str]: """Get alternative id to primary id mapping.""" return { alt: primary - for primary, alts in get_id_to_alts(prefix, force=force).items() + for primary, alts in get_id_to_alts(prefix, force=force, version=version).items() for alt in alts } -def get_primary_curie(curie: str) -> Optional[str]: +def get_primary_curie(curie: str, *, version: Optional[str] = None) -> Optional[str]: """Get the primary curie for an entity.""" prefix, identifier = normalize_curie(curie) - primary_identifier = get_primary_identifier(prefix, identifier) + primary_identifier = get_primary_identifier(prefix, identifier, version=version) if primary_identifier is not None: return f"{prefix}:{primary_identifier}" return None @wrap_norm_prefix -def get_primary_identifier(prefix: str, identifier: str) -> str: +def get_primary_identifier(prefix: str, identifier: str, *, version: Optional[str] = None) -> str: """Get the primary identifier for an entity. :param prefix: The name of the resource @@ -82,7 +87,7 @@ def get_primary_identifier(prefix: str, identifier: str) -> str: if prefix in NO_ALTS: # TODO later expand list to other namespaces with no alts return identifier - alts_to_id = get_alts_to_id(prefix) + alts_to_id = get_alts_to_id(prefix, version=version) if alts_to_id and identifier in alts_to_id: return alts_to_id[identifier] return identifier diff --git a/src/pyobo/api/hierarchy.py b/src/pyobo/api/hierarchy.py index 8f7bd57f..095cfbec 100644 --- a/src/pyobo/api/hierarchy.py +++ b/src/pyobo/api/hierarchy.py @@ -13,6 +13,7 @@ from .relations import get_filtered_relations_df from ..identifier_utils import wrap_norm_prefix from ..struct import TypeDef, has_member, is_a, part_of +from ..struct.reference import Reference __all__ = [ "get_hierarchy", @@ -24,7 +25,6 @@ "get_children", ] -from ..struct.reference import Reference logger = logging.getLogger(__name__) @@ -154,14 +154,16 @@ def _get_hierarchy_helper( return rv -def is_descendent(prefix, identifier, ancestor_prefix, ancestor_identifier) -> bool: +def is_descendent( + prefix, identifier, ancestor_prefix, ancestor_identifier, *, version: Optional[str] = None +) -> bool: """Check that the first identifier has the second as a descendent. Check that go:0070246 ! natural killer cell apoptotic process is a descendant of go:0006915 ! apoptotic process:: >>> assert is_descendent('go', '0070246', 'go', '0006915') """ - descendants = get_descendants(ancestor_prefix, ancestor_identifier) + descendants = get_descendants(ancestor_prefix, ancestor_identifier, version=version) return descendants is not None and f"{prefix}:{identifier}" in descendants @@ -224,13 +226,15 @@ def get_children( return set(hierarchy.predecessors(curie)) -def has_ancestor(prefix, identifier, ancestor_prefix, ancestor_identifier) -> bool: +def has_ancestor( + prefix, identifier, ancestor_prefix, ancestor_identifier, *, version: Optional[str] = None +) -> bool: """Check that the first identifier has the second as an ancestor. Check that go:0008219 ! cell death is an ancestor of go:0006915 ! apoptotic process:: >>> assert has_ancestor('go', '0006915', 'go', '0008219') """ - ancestors = get_ancestors(prefix, identifier) + ancestors = get_ancestors(prefix, identifier, version=version) return ancestors is not None and f"{ancestor_prefix}:{ancestor_identifier}" in ancestors diff --git a/src/pyobo/api/metadata.py b/src/pyobo/api/metadata.py index d31c1618..641f0b02 100644 --- a/src/pyobo/api/metadata.py +++ b/src/pyobo/api/metadata.py @@ -4,7 +4,7 @@ import logging from functools import lru_cache -from typing import Mapping +from typing import Mapping, Optional from .utils import get_version from ..getters import get_ontology @@ -21,9 +21,12 @@ @lru_cache() @wrap_norm_prefix -def get_metadata(prefix: str, force: bool = False) -> Mapping[str, str]: +def get_metadata( + prefix: str, *, force: bool = False, version: Optional[str] = None +) -> Mapping[str, str]: """Get metadata for the ontology.""" - version = get_version(prefix) + if version is None: + version = get_version(prefix) path = prefix_cache_join(prefix, name="metadata.json", version=version) @cached_json(path=path, force=force) diff --git a/src/pyobo/api/names.py b/src/pyobo/api/names.py index 2233dca1..4c268a77 100644 --- a/src/pyobo/api/names.py +++ b/src/pyobo/api/names.py @@ -69,7 +69,7 @@ def _help_get( NO_BUILD_PREFIXES.add(prefix) return None - primary_id = get_primary_identifier(prefix, identifier) + primary_id = get_primary_identifier(prefix, identifier, version=version) return mapping.get(primary_id) @@ -82,7 +82,7 @@ def get_name(prefix: str, identifier: str, *, version: Optional[str] = None) -> @lru_cache() @wrap_norm_prefix def get_ids( - prefix: str, force: bool = False, strict: bool = False, version: Optional[str] = None + prefix: str, *, force: bool = False, strict: bool = False, version: Optional[str] = None ) -> Set[str]: """Get the set of identifiers for this prefix.""" if prefix == "ncbigene": @@ -150,16 +150,18 @@ def _get_id_name_mapping() -> Mapping[str, str]: @lru_cache() @wrap_norm_prefix -def get_name_id_mapping(prefix: str, force: bool = False) -> Mapping[str, str]: +def get_name_id_mapping( + prefix: str, *, force: bool = False, version: Optional[str] = None +) -> Mapping[str, str]: """Get a name to identifier mapping for the OBO file.""" - id_name = get_id_name_mapping(prefix=prefix, force=force) + id_name = get_id_name_mapping(prefix=prefix, force=force, version=version) return {v: k for k, v in id_name.items()} @wrap_norm_prefix -def get_definition(prefix: str, identifier: str) -> Optional[str]: +def get_definition(prefix: str, identifier: str, *, version: Optional[str] = None) -> Optional[str]: """Get the definition for an entity.""" - return _help_get(get_id_definition_mapping, prefix, identifier) + return _help_get(get_id_definition_mapping, prefix, identifier, version=version) def get_id_definition_mapping( diff --git a/src/pyobo/api/relations.py b/src/pyobo/api/relations.py index 1932592d..8472748e 100644 --- a/src/pyobo/api/relations.py +++ b/src/pyobo/api/relations.py @@ -48,9 +48,11 @@ def get_relations_df( force: bool = False, wide: bool = False, strict: bool = True, + version: Optional[str] = None, ) -> pd.DataFrame: """Get all relations from the OBO.""" - version = get_version(prefix) + if version is None: + version = get_version(prefix) path = prefix_cache_join(prefix, name="relations.tsv", version=version) @cached_df(path=path, dtype=str, force=force) @@ -118,9 +120,11 @@ def get_id_multirelations_mapping( *, use_tqdm: bool = False, force: bool = False, + version: Optional[str] = None, ) -> Mapping[str, List[Reference]]: """Get the OBO file and output a synonym dictionary.""" - version = get_version(prefix) + if version is None: + version = get_version(prefix) ontology = get_ontology(prefix, force=force, version=version) return ontology.get_id_multirelations_mapping(typedef=typedef, use_tqdm=use_tqdm) @@ -134,6 +138,7 @@ def get_relation_mapping( *, use_tqdm: bool = False, force: bool = False, + version: Optional[str] = None, ) -> Mapping[str, str]: """Get relations from identifiers in the source prefix to target prefix with the given relation. @@ -147,7 +152,8 @@ def get_relation_mapping( >>> hgnc_mgi_orthology_mapping = pyobo.get_relation_mapping('hgnc', 'ro:HOM0000017', 'mgi') >>> assert mouse_mapt_mgi_id == hgnc_mgi_orthology_mapping[human_mapt_hgnc_id] """ - version = get_version(prefix) + if version is None: + version = get_version(prefix) ontology = get_ontology(prefix, force=force, version=version) return ontology.get_relation_mapping( relation=relation, target_prefix=target_prefix, use_tqdm=use_tqdm @@ -163,6 +169,7 @@ def get_relation( *, use_tqdm: bool = False, force: bool = False, + **kwargs, ) -> Optional[str]: """Get the target identifier corresponding to the given relationship from the source prefix/identifier pair. @@ -181,6 +188,7 @@ def get_relation( target_prefix=target_prefix, use_tqdm=use_tqdm, force=force, + **kwargs, ) return relation_mapping.get(source_identifier) diff --git a/src/pyobo/api/species.py b/src/pyobo/api/species.py index e0ffbf1d..5c2d2222 100644 --- a/src/pyobo/api/species.py +++ b/src/pyobo/api/species.py @@ -22,13 +22,13 @@ @wrap_norm_prefix -def get_species(prefix: str, identifier: str) -> Optional[str]: +def get_species(prefix: str, identifier: str, *, version: Optional[str] = None) -> Optional[str]: """Get the species.""" if prefix == "uniprot": raise NotImplementedError try: - id_species = get_id_species_mapping(prefix) + id_species = get_id_species_mapping(prefix, version=version) except NoBuild: logger.warning("unable to look up species for prefix %s", prefix) return None @@ -37,7 +37,7 @@ def get_species(prefix: str, identifier: str) -> Optional[str]: logger.warning("no results produced for prefix %s", prefix) return None - primary_id = get_primary_identifier(prefix, identifier) + primary_id = get_primary_identifier(prefix, identifier, version=version) return id_species.get(primary_id) diff --git a/src/pyobo/api/typedefs.py b/src/pyobo/api/typedefs.py index 39421ff2..6d390672 100644 --- a/src/pyobo/api/typedefs.py +++ b/src/pyobo/api/typedefs.py @@ -4,6 +4,7 @@ import logging from functools import lru_cache +from typing import Optional import pandas as pd @@ -22,9 +23,12 @@ @lru_cache() @wrap_norm_prefix -def get_typedef_df(prefix: str, force: bool = False) -> pd.DataFrame: +def get_typedef_df( + prefix: str, *, force: bool = False, version: Optional[str] = None +) -> pd.DataFrame: """Get an identifier to name mapping for the typedefs in an OBO file.""" - version = get_version(prefix) + if version is None: + version = get_version(prefix) path = prefix_cache_join(prefix, name="typedefs.tsv", version=version) @cached_df(path=path, dtype=str, force=force) diff --git a/src/pyobo/api/xrefs.py b/src/pyobo/api/xrefs.py index e2cb2bec..9bfcd2e0 100644 --- a/src/pyobo/api/xrefs.py +++ b/src/pyobo/api/xrefs.py @@ -30,9 +30,16 @@ @wrap_norm_prefix -def get_xref(prefix: str, identifier: str, new_prefix: str, flip: bool = False) -> Optional[str]: +def get_xref( + prefix: str, + identifier: str, + new_prefix: str, + *, + flip: bool = False, + version: Optional[str] = None, +) -> Optional[str]: """Get the xref with the new prefix if a direct path exists.""" - filtered_xrefs = get_filtered_xrefs(prefix, new_prefix, flip=flip) + filtered_xrefs = get_filtered_xrefs(prefix, new_prefix, flip=flip, version=version) return filtered_xrefs.get(identifier) @@ -41,8 +48,8 @@ def get_xref(prefix: str, identifier: str, new_prefix: str, flip: bool = False) def get_filtered_xrefs( prefix: str, xref_prefix: str, - flip: bool = False, *, + flip: bool = False, use_tqdm: bool = False, force: bool = False, strict: bool = False, diff --git a/src/pyobo/aws.py b/src/pyobo/aws.py index d552782c..ac9471fc 100644 --- a/src/pyobo/aws.py +++ b/src/pyobo/aws.py @@ -77,14 +77,19 @@ def upload_artifacts( upload_artifacts_for_prefix(prefix=prefix, bucket=bucket, s3_client=s3_client) -def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): +def upload_artifacts_for_prefix( + *, prefix: str, bucket: str, s3_client=None, version: Optional[str] = None +): """Upload compiled parts for the given prefix to AWS.""" if s3_client is None: s3_client = boto3.client("s3") + if version is None: + version = get_version(prefix) + logger.info("[%s] getting id->name mapping", prefix) get_id_name_mapping(prefix) - id_name_path = prefix_cache_join(prefix, name="names.tsv", version=get_version(prefix)) + id_name_path = prefix_cache_join(prefix, name="names.tsv", version=version) if not id_name_path.exists(): raise FileNotFoundError id_name_key = os.path.join(prefix, "cache", "names.tsv") @@ -93,7 +98,7 @@ def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): logger.info("[%s] getting id->synonyms mapping", prefix) get_id_synonyms_mapping(prefix) - id_synonyms_path = prefix_cache_join(prefix, name="synonyms.tsv", version=get_version(prefix)) + id_synonyms_path = prefix_cache_join(prefix, name="synonyms.tsv", version=version) if not id_synonyms_path.exists(): raise FileNotFoundError id_synonyms_key = os.path.join(prefix, "cache", "synonyms.tsv") @@ -102,7 +107,7 @@ def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): logger.info("[%s] getting xrefs", prefix) get_xrefs_df(prefix) - xrefs_path = prefix_cache_join(prefix, name="xrefs.tsv", version=get_version(prefix)) + xrefs_path = prefix_cache_join(prefix, name="xrefs.tsv", version=version) if not xrefs_path.exists(): raise FileNotFoundError xrefs_key = os.path.join(prefix, "cache", "xrefs.tsv") @@ -111,7 +116,7 @@ def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): logger.info("[%s] getting relations", prefix) get_relations_df(prefix) - relations_path = prefix_cache_join(prefix, name="relations.tsv", version=get_version(prefix)) + relations_path = prefix_cache_join(prefix, name="relations.tsv", version=version) if not relations_path.exists(): raise FileNotFoundError relations_key = os.path.join(prefix, "cache", "relations.tsv") @@ -120,7 +125,7 @@ def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): logger.info("[%s] getting properties", prefix) get_properties_df(prefix) - properties_path = prefix_cache_join(prefix, name="properties.tsv", version=get_version(prefix)) + properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version) if not properties_path.exists(): raise FileNotFoundError properties_key = os.path.join(prefix, "cache", "properties.tsv") @@ -129,7 +134,7 @@ def upload_artifacts_for_prefix(*, prefix: str, bucket: str, s3_client=None): logger.info("[%s] getting alternative identifiers", prefix) get_id_to_alts(prefix) - alts_path = prefix_cache_join(prefix, name="alt_ids.tsv", version=get_version(prefix)) + alts_path = prefix_cache_join(prefix, name="alt_ids.tsv", version=version) if not alts_path.exists(): raise FileNotFoundError alts_key = os.path.join(prefix, "cache", "alt_ids.tsv") diff --git a/src/pyobo/cli/lookup.py b/src/pyobo/cli/lookup.py index 876300cf..7d4746f0 100644 --- a/src/pyobo/cli/lookup.py +++ b/src/pyobo/cli/lookup.py @@ -76,9 +76,10 @@ def xrefs(prefix: str, target: str, force: bool, no_strict: bool, version: Optio @prefix_argument @verbose_option @force_option -def metadata(prefix: str, force: bool): +@version_option +def metadata(prefix: str, force: bool, version: Optional[str]): """Print the metadata for the given namespace.""" - metadata = get_metadata(prefix, force=force) + metadata = get_metadata(prefix, force=force, version=version) click.echo(json.dumps(metadata, indent=2)) diff --git a/src/pyobo/constants.py b/src/pyobo/constants.py index 6bb939dc..3fd8279d 100644 --- a/src/pyobo/constants.py +++ b/src/pyobo/constants.py @@ -4,17 +4,13 @@ import logging import re -from functools import partial -from typing import Callable -import bioversions import pystow __all__ = [ "RAW_DIRECTORY", "DATABASE_DIRECTORY", "SPECIES_REMAPPING", - "version_getter", ] logger = logging.getLogger(__name__) @@ -85,11 +81,6 @@ SPECIES_FILE = "species.tsv.gz" -def version_getter(name: str) -> Callable[[], str]: - """Make a function appropriate for getting versions.""" - return partial(bioversions.get_version, name) - - NCBITAXON_PREFIX = "NCBITaxon" DATE_FORMAT = "%d:%m:%Y %H:%M" PROVENANCE_PREFIXES = { diff --git a/src/pyobo/sources/antibodyregistry.py b/src/pyobo/sources/antibodyregistry.py index eb269317..df757590 100644 --- a/src/pyobo/sources/antibodyregistry.py +++ b/src/pyobo/sources/antibodyregistry.py @@ -24,9 +24,10 @@ CHUNKSIZE = 20_000 -def get_chunks(force: bool = False) -> pd.DataFrame: +def get_chunks(*, force: bool = False, version: Optional[str] = None) -> pd.DataFrame: """Get the BioGRID identifiers mapping dataframe.""" - version = bioversions.get_version(PREFIX) + if version is None: + version = bioversions.get_version(PREFIX) df = ensure_df( PREFIX, url=URL, @@ -47,7 +48,7 @@ class AntibodyRegistryGetter(Obo): def iter_terms(self, force: bool = False) -> Iterable[Term]: """Iterate over terms in the ontology.""" - return iter_terms(force=force) + return iter_terms(force=force, version=self._version_or_raise) def get_obo(*, force: bool = False) -> Obo: @@ -74,9 +75,9 @@ def get_obo(*, force: bool = False) -> Obo: } -def iter_terms(force: bool = False) -> Iterable[Term]: +def iter_terms(*, force: bool = False, version: Optional[str] = None) -> Iterable[Term]: """Iterate over antibodies.""" - chunks = get_chunks(force=force) + chunks = get_chunks(force=force, version=version) needs_curating = set() # df['vendor'] = df['vendor'].map(bioregistry.normalize_prefix) it = tqdm(chunks, desc=f"{PREFIX}, chunkssize={CHUNKSIZE}") diff --git a/src/pyobo/sources/biogrid.py b/src/pyobo/sources/biogrid.py index 5c869742..48440085 100644 --- a/src/pyobo/sources/biogrid.py +++ b/src/pyobo/sources/biogrid.py @@ -2,12 +2,12 @@ """Extract and convert BioGRID identifiers.""" +from functools import partial from typing import Mapping, Optional import bioversions import pandas as pd -from pyobo.constants import version_getter from pyobo.resources.ncbitaxon import get_ncbitaxon_id from pyobo.utils.cache import cached_mapping from pyobo.utils.path import ensure_df, prefix_directory_join @@ -61,7 +61,11 @@ def get_df() -> pd.DataFrame: @cached_mapping( path=prefix_directory_join( - PREFIX, "cache", "xrefs", name="ncbigene.tsv", version=version_getter(PREFIX) + PREFIX, + "cache", + "xrefs", + name="ncbigene.tsv", + version=partial(bioversions.get_version, PREFIX), ), header=["biogrid_id", "ncbigene_id"], ) diff --git a/src/pyobo/sources/ccle.py b/src/pyobo/sources/ccle.py index 066cd8ae..1c0612d7 100644 --- a/src/pyobo/sources/ccle.py +++ b/src/pyobo/sources/ccle.py @@ -50,7 +50,7 @@ def iter_terms(version: Optional[str] = None, force: bool = False) -> Iterable[T yield term -def get_version() -> str: +def get_ccle_static_version() -> str: """Get the default version of CCLE's cell lines.""" return "2019" @@ -58,21 +58,21 @@ def get_version() -> str: def get_url(version: Optional[str] = None) -> str: """Get the cBioPortal URL for the given version of CCLE's cell lines.""" if version is None: - version = get_version() + version = get_ccle_static_version() return f"https://cbioportal-datahub.s3.amazonaws.com/ccle_broad_{version}.tar.gz" def get_inner(version: Optional[str] = None) -> str: """Get the inner tarfile path.""" if version is None: - version = get_version() + version = get_ccle_static_version() return f"ccle_broad_{version}/data_clinical_sample.txt" def ensure(version: Optional[str] = None, **kwargs) -> Path: """Ensure the given version is downloaded.""" if version is None: - version = get_version() + version = get_ccle_static_version() url = get_url(version=version) return pystow.ensure("pyobo", "raw", PREFIX, version, url=url, **kwargs) @@ -80,7 +80,7 @@ def ensure(version: Optional[str] = None, **kwargs) -> Path: def ensure_df(version: Optional[str] = None, force: bool = False) -> pd.DataFrame: """Get the CCLE clinical sample dataframe.""" if version is None: - version = get_version() + version = get_ccle_static_version() path = ensure(version=version, force=force) inner_path = get_inner(version=version) with tarfile.open(path) as tf: diff --git a/src/pyobo/sources/mesh.py b/src/pyobo/sources/mesh.py index 5478ec30..0368e36a 100644 --- a/src/pyobo/sources/mesh.py +++ b/src/pyobo/sources/mesh.py @@ -318,21 +318,23 @@ def _get_descriptor_qualifiers(descriptor: Element) -> List[Mapping[str, str]]: ] -def get_mesh_category_curies(letter: str, skip: Optional[Collection[str]] = None) -> List[str]: +def get_mesh_category_curies( + letter: str, *, skip: Optional[Collection[str]] = None, version: Optional[str] = None +) -> List[str]: """Get the MeSH LUIDs for a category, by letter (e.g., "A"). :param letter: The MeSH tree, A for anatomy, C for disease, etc. :param skip: An optional collection of MeSH tree codes to skip, such as "A03" + :param version: The MeSH version to use. Defaults to latest :returns: A list of MeSH CURIE strings for the top level of each MeSH tree. .. seealso:: https://meshb.nlm.nih.gov/treeView """ - import bioversions + if version is None: + import bioversions - mesh_version = bioversions.get_version("mesh") - if mesh_version is None: - raise ValueError - tree_to_mesh = get_tree_to_mesh_id(mesh_version) + version = bioversions.get_version("mesh") + tree_to_mesh = get_tree_to_mesh_id(version=version) rv = [] for i in range(1, 100): key = f"{letter}{i:02}"