From 0df22de681bbf5f9a3bf11f033e684ab4d8a2b15 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:06:21 +0100 Subject: [PATCH 01/23] Begin cleanup --- pyproject.toml | 4 ++++ src/semra/client.py | 16 +++++++++++----- src/semra/sources/clo.py | 6 +++++- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b221004..25ac2b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ dependencies = [ "black[jupyter]>=23.1.0", "mypy>=1.0.0", "ruff>=0.0.243", + "pydantic", ] [tool.hatch.envs.lint.scripts] typing = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/semra tests}" @@ -105,6 +106,9 @@ all = [ target-version = ["py39"] line-length = 120 +[tool.mypy] +plugins = ["pydantic.mypy"] + [tool.ruff] target-version = "py39" line-length = 120 diff --git a/src/semra/client.py b/src/semra/client.py index badfde0..b029561 100644 --- a/src/semra/client.py +++ b/src/semra/client.py @@ -12,7 +12,7 @@ import neo4j.graph import networkx as nx import pydantic -from neo4j import Transaction, unit_of_work +from neo4j import unit_of_work from typing_extensions import TypeAlias import semra @@ -125,6 +125,8 @@ def _get_node_by_curie(self, curie: ReferenceHint) -> Node: def get_mapping(self, curie: ReferenceHint) -> semra.Mapping: """Get a mapping.""" + if isinstance(curie, Reference): + curie = curie.curie if not curie.startswith("semra.mapping:"): curie = f"semra.mapping:{curie}" query = """\ @@ -175,6 +177,8 @@ def get_mapping_set(self, curie: ReferenceHint) -> MappingSet: For example, use ``semra.mappingset:7831d5bc95698099fb6471667e5282cd`` for biomappings :return: A mapping set """ + if isinstance(curie, Reference): + curie = curie.curie if not curie.startswith("semra.mappingset:"): curie = f"semra.mappingset:{curie}" node = self._get_node_by_curie(curie) @@ -217,7 +221,9 @@ def summarize_nodes(self) -> t.Counter[str]: def summarize_concepts(self) -> t.Counter[tuple[str, str]]: query = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)" - return Counter({(prefix, bioregistry.get_name(prefix)): count for prefix, count in self.read_query(query)}) + return Counter( + {(prefix, t.cast(str, bioregistry.get_name(prefix))): count for prefix, count in self.read_query(query)} + ) def summarize_authors(self) -> t.Counter[tuple[str, str]]: query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, a.name, count(e)" @@ -257,8 +263,8 @@ def get_connected_component_graph(self, curie: str) -> nx.MultiDiGraph: g.add_node(node["curie"], **node) for relation in relations: g.add_edge( - relation.nodes[0]["curie"], - relation.nodes[1]["curie"], + relation.nodes[0]["curie"], # type: ignore + relation.nodes[1]["curie"], # type: ignore key=relation.element_id, type=relation.type, **relation, @@ -273,6 +279,6 @@ def get_concept_name(self, curie: str) -> str | None: # https://neo4j.com/docs/python-manual/current/session-api/#python-driver-simple-transaction-fn # and from the docstring of neo4j.Session.read_transaction @unit_of_work() -def do_cypher_tx(tx: Transaction, query: str, **query_params) -> list[list]: +def do_cypher_tx(tx, query, **query_params) -> list[list]: result = tx.run(query, parameters=query_params) return [record.values() for record in result] diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index 8fafa28..db66ced 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -1,5 +1,7 @@ """Process mappings from CLO.""" +from typing import Optional + import bioontologies import bioregistry import click @@ -36,6 +38,8 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: continue for raw_curie in _split(p.value_raw): curie = raw_curie.removeprefix("rrid:").removeprefix("RRID:") + prefix: Optional[str] + identifier: Optional[str] if curie.startswith("Sanger:COSMICID:"): prefix, identifier = "cosmic.cell", curie.removeprefix("Sanger:COSMICID:") elif curie.startswith("atcc:COSMICID:"): @@ -83,7 +87,7 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: else: prefix, identifier = bioregistry.parse_curie(curie) - if prefix is None: + if prefix is None or identifier is None: tqdm.write(f"CLO:{clo_id} unparsed: {click.style(curie, fg='red')} from line:\n {p.value_raw}") continue if prefix in SKIP_PREFIXES: From 179118c78af7a20498540e70bdabee7dc29d757d Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:13:43 +0100 Subject: [PATCH 02/23] Update typing --- src/semra/api.py | 80 ++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/src/semra/api.py b/src/semra/api.py index f2091b5..747c6a6 100644 --- a/src/semra/api.py +++ b/src/semra/api.py @@ -4,6 +4,7 @@ import itertools as itt import logging +import typing as t from collections import Counter, defaultdict from collections.abc import Iterable from typing import cast @@ -22,7 +23,14 @@ KNOWLEDGE_MAPPING, NARROW_MATCH, ) -from semra.struct import Evidence, Mapping, ReasonedEvidence, Reference, Triple, triple_key +from semra.struct import ( + Evidence, + Mapping, + ReasonedEvidence, + Reference, + Triple, + triple_key, +) logger = logging.getLogger(__name__) @@ -30,7 +38,7 @@ EVIDENCE_KEY = "evidence" #: An index allows for the aggregation of evidences for each core triple -Index = dict[Triple, list[Evidence]] +Index = t.Dict[Triple, t.List[Evidence]] def _tqdm(mappings: Iterable[Mapping], desc: str | None = None, *, progress: bool = True): @@ -43,7 +51,7 @@ def _tqdm(mappings: Iterable[Mapping], desc: str | None = None, *, progress: boo ) -def count_source_target(mappings: Iterable[Mapping]) -> Counter[tuple[str, str]]: +def count_source_target(mappings: Iterable[Mapping]) -> Counter[t.Tuple[str, str]]: """Count source prefix-target prefix pairs.""" return Counter((s.prefix, o.prefix) for s, _, o in get_index(mappings)) @@ -65,18 +73,18 @@ def print_source_target_counts(mappings: Iterable[Mapping], minimum: int = 0) -> def get_index(mappings: Iterable[Mapping], *, progress: bool = True) -> Index: """Aggregate and deduplicate evidences for each core triple.""" - dd: defaultdict[Triple, list[Evidence]] = defaultdict(list) + dd: t.DefaultDict[Triple, t.List[Evidence]] = defaultdict(list) for mapping in _tqdm(mappings, desc="Indexing mappings", progress=progress): dd[mapping.triple].extend(mapping.evidence) return {triple: deduplicate_evidence(evidence) for triple, evidence in dd.items()} -def assemble_evidences(mappings: list[Mapping], *, progress: bool = True) -> list[Mapping]: +def assemble_evidences(mappings: t.List[Mapping], *, progress: bool = True) -> t.List[Mapping]: index = get_index(mappings, progress=progress) return unindex(index, progress=progress) -def infer_reversible(mappings: list[Mapping], *, progress: bool = True) -> list[Mapping]: +def infer_reversible(mappings: t.List[Mapping], *, progress: bool = True) -> t.List[Mapping]: rv = [] for mapping in _tqdm(mappings, desc="Infer reverse", progress=progress): rv.append(mapping) @@ -111,7 +119,7 @@ def flip(mapping: Mapping) -> Mapping | None: ) -def to_graph(mappings: list[Mapping]) -> nx.DiGraph: +def to_graph(mappings: t.List[Mapping]) -> nx.DiGraph: """Convert mappings into a directed graph data model.""" graph = nx.DiGraph() for mapping in mappings: @@ -123,7 +131,7 @@ def to_graph(mappings: list[Mapping]) -> nx.DiGraph: return graph -def from_graph(graph: nx.DiGraph) -> list[Mapping]: +def from_graph(graph: nx.DiGraph) -> t.List[Mapping]: """Extract mappings from a directed graph data model.""" return [_from_edge(graph, s, o) for s, o in graph.edges()] @@ -133,7 +141,7 @@ def _from_edge(graph: nx.DiGraph, s: Reference, o: Reference) -> Mapping: return Mapping(s=s, p=data[PREDICATE_KEY], o=o, evidence=data[EVIDENCE_KEY]) -def _condense_predicates(predicates: list[Reference]) -> Reference | None: +def _condense_predicates(predicates: t.List[Reference]) -> Reference | None: predicate_set = set(predicates) if predicate_set == {EXACT_MATCH}: return EXACT_MATCH @@ -145,8 +153,8 @@ def _condense_predicates(predicates: list[Reference]) -> Reference | None: def infer_chains( - mappings: list[Mapping], *, backwards: bool = True, progress: bool = True, cutoff: int = 5 -) -> list[Mapping]: + mappings: t.List[Mapping], *, backwards: bool = True, progress: bool = True, cutoff: int = 5 +) -> t.List[Mapping]: """Apply graph-based reasoning over mapping chains to infer new mappings. :param mappings: A list of input mappings @@ -198,7 +206,7 @@ def tabulate_index(index: Index) -> str: """Tabulate""" from tabulate import tabulate - rows: list[tuple[str, str, str, str]] = [] + rows: t.List[t.Tuple[str, str, str, str]] = [] def key(pair): return triple_key(pair[0]) @@ -218,16 +226,16 @@ def infer_mutual_dbxref_mutations( mappings: Iterable[Mapping], prefixes: set[str], confidence: float | None = None, -) -> list[Mapping]: +) -> t.List[Mapping]: pairs = {(s, t) for s, t in itt.product(prefixes, repeat=2) if s != t} return infer_dbxref_mutations(mappings, pairs=pairs, confidence=confidence) def infer_dbxref_mutations( mappings: Iterable[Mapping], - pairs: dict[tuple[str, str], float] | Iterable[tuple[str, str]], + pairs: t.Dict[t.Tuple[str, str], float] | Iterable[t.Tuple[str, str]], confidence: float | None = None, -) -> list[Mapping]: +) -> t.List[Mapping]: """Upgrade database cross-references into exact matches for the given pairs. :param mappings: A list of mappings @@ -249,12 +257,12 @@ def infer_dbxref_mutations( def infer_mutations( mappings: Iterable[Mapping], - pairs: dict[tuple[str, str], float], + pairs: t.Dict[t.Tuple[str, str], float], old: Reference, new: Reference, *, progress: bool = False, -) -> list[Mapping]: +) -> t.List[Mapping]: """Infer mappings with alternate predicates for the given prefix pairs. :param mappings: Mappings to infer from @@ -286,7 +294,7 @@ def infer_mutations( return rv -def keep_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, progress: bool = True) -> list[Mapping]: +def keep_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, progress: bool = True) -> t.List[Mapping]: """Filter out mappings whose subject or object are not in the given list of prefixes.""" prefixes = set(prefixes) return [ @@ -314,7 +322,7 @@ def keep_object_prefixes(mappings: Iterable[Mapping], prefixes: str | Iterable[s ] -def filter_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, progress: bool = True) -> list[Mapping]: +def filter_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, progress: bool = True) -> t.List[Mapping]: """Filter out mappings whose subject or object are in the given list of prefixes.""" prefixes = set(prefixes) return [ @@ -324,7 +332,7 @@ def filter_prefixes(mappings: Iterable[Mapping], prefixes: Iterable[str], *, pro ] -def filter_self_matches(mappings: Iterable[Mapping], *, progress: bool = True) -> list[Mapping]: +def filter_self_matches(mappings: Iterable[Mapping], *, progress: bool = True) -> t.List[Mapping]: """Filter out mappings within the same resource.""" return [ mapping @@ -333,7 +341,7 @@ def filter_self_matches(mappings: Iterable[Mapping], *, progress: bool = True) - ] -def filter_mappings(mappings: list[Mapping], skip_mappings: list[Mapping], *, progress: bool = True) -> list[Mapping]: +def filter_mappings(mappings: t.List[Mapping], skip_mappings: t.List[Mapping], *, progress: bool = True) -> t.List[Mapping]: """Filter out mappings in the second set from the first set.""" skip_triples = {skip_mapping.triple for skip_mapping in skip_mappings} return [ @@ -343,10 +351,10 @@ def filter_mappings(mappings: list[Mapping], skip_mappings: list[Mapping], *, pr ] -M2MIndex = defaultdict[tuple[str, str], defaultdict[str, defaultdict[str, list[Mapping]]]] +M2MIndex = t.DefaultDict[t.Tuple[str, str], t.DefaultDict[str, t.DefaultDict[str, t.List[Mapping]]]] -def get_many_to_many(mappings: list[Mapping]) -> list[Mapping]: +def get_many_to_many(mappings: t.List[Mapping]) -> t.List[Mapping]: """Get many-to-many mappings, disregarding predicate type.""" forward: M2MIndex = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) backward: M2MIndex = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) @@ -354,7 +362,7 @@ def get_many_to_many(mappings: list[Mapping]) -> list[Mapping]: forward[mapping.s.prefix, mapping.o.prefix][mapping.s.identifier][mapping.o.identifier].append(mapping) backward[mapping.s.prefix, mapping.o.prefix][mapping.o.identifier][mapping.s.identifier].append(mapping) - index: defaultdict[Triple, list[Evidence]] = defaultdict(list) + index: t.DefaultDict[Triple, t.List[Evidence]] = defaultdict(list) for preindex in [forward, backward]: for d1 in preindex.values(): for d2 in d1.values(): @@ -366,15 +374,15 @@ def get_many_to_many(mappings: list[Mapping]) -> list[Mapping]: return rv -def filter_many_to_many(mappings: list[Mapping], *, progress: bool = True) -> list[Mapping]: +def filter_many_to_many(mappings: t.List[Mapping], *, progress: bool = True) -> t.List[Mapping]: """Filter out many to many mappings.""" skip_mappings = get_many_to_many(mappings) return filter_mappings(mappings, skip_mappings, progress=progress) def project( - mappings: list[Mapping], source_prefix: str, target_prefix: str, *, return_sus: bool = False, progress: bool = False -) -> list[Mapping] | tuple[list[Mapping], list[Mapping]]: + mappings: t.List[Mapping], source_prefix: str, target_prefix: str, *, return_sus: bool = False, progress: bool = False +) -> t.List[Mapping] | t.Tuple[t.List[Mapping], t.List[Mapping]]: """Ensure that each identifier only appears as the subject of one mapping.""" mappings = keep_subject_prefixes(mappings, source_prefix, progress=progress) mappings = keep_object_prefixes(mappings, target_prefix, progress=progress) @@ -386,13 +394,13 @@ def project( return mappings -def project_dict(mappings: list[Mapping], source_prefix: str, target_prefix: str) -> dict[str, str]: +def project_dict(mappings: t.List[Mapping], source_prefix: str, target_prefix: str) -> t.Dict[str, str]: """Get a dictionary from source identifiers to target identifiers.""" - mappings = cast(list[Mapping], project(mappings, source_prefix, target_prefix)) + mappings = cast(t.List[Mapping], project(mappings, source_prefix, target_prefix)) return {mapping.s.identifier: mapping.o.identifier for mapping in mappings} -def prioritize(mappings: list[Mapping], priority: list[str]) -> list[Mapping]: +def prioritize(mappings: t.List[Mapping], priority: t.List[str]) -> t.List[Mapping]: """Get a priority star graph. :param mappings: @@ -403,7 +411,7 @@ def prioritize(mappings: list[Mapping], priority: list[str]) -> list[Mapping]: exact_mappings = len(mappings) graph = to_graph(mappings).to_undirected() - rv: list[Mapping] = [] + rv: t.List[Mapping] = [] for component in tqdm(nx.connected_components(graph), unit="component", unit_scale=True): o = _get_priority(component, priority) if o is None: @@ -427,7 +435,7 @@ def prioritize(mappings: list[Mapping], priority: list[str]) -> list[Mapping]: return rv -def _get_priority(component: list[Reference], priority: list[str]) -> Reference | None: +def _get_priority(component: t.List[Reference], priority: t.List[str]) -> t.Optional[Reference]: prefix_to_references = defaultdict(list) for c in component: prefix_to_references[c.prefix].append(c) @@ -444,7 +452,7 @@ def _get_priority(component: list[Reference], priority: list[str]) -> Reference return None -def unindex(index: Index, *, progress: bool = True) -> list[Mapping]: +def unindex(index: Index, *, progress: bool = True) -> t.List[Mapping]: """Convert a mapping index into a list of mapping objects.""" return [ Mapping.from_triple(triple, evidence=evidence) @@ -454,13 +462,13 @@ def unindex(index: Index, *, progress: bool = True) -> list[Mapping]: ] -def deduplicate_evidence(evidence: list[Evidence]) -> list[Evidence]: +def deduplicate_evidence(evidence: t.List[Evidence]) -> t.List[Evidence]: """Deduplicate a list of evidences based on their "key" function.""" d = {e.key(): e for e in evidence} return list(d.values()) -def validate_mappings(mappings: list[Mapping], *, progress: bool = True) -> None: +def validate_mappings(mappings: t.List[Mapping], *, progress: bool = True) -> None: """Validate mappings against the Bioregistry and raise an error on the first invalid.""" import bioregistry @@ -489,7 +497,7 @@ def validate_mappings(mappings: list[Mapping], *, progress: bool = True) -> None raise ValueError(f"banana in mapping object: {mapping}") -def summarize_prefixes(mappings: list[Mapping]) -> pd.DataFrame: +def summarize_prefixes(mappings: t.List[Mapping]) -> pd.DataFrame: """Get a dataframe summarizing the prefixes appearing in the mappings.""" import bioregistry From d8c8dbe39ec265ac637562888b67567caacc0cdb Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:13:50 +0100 Subject: [PATCH 03/23] Update api.py --- src/semra/api.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/semra/api.py b/src/semra/api.py index 747c6a6..eb2cf28 100644 --- a/src/semra/api.py +++ b/src/semra/api.py @@ -341,7 +341,9 @@ def filter_self_matches(mappings: Iterable[Mapping], *, progress: bool = True) - ] -def filter_mappings(mappings: t.List[Mapping], skip_mappings: t.List[Mapping], *, progress: bool = True) -> t.List[Mapping]: +def filter_mappings( + mappings: t.List[Mapping], skip_mappings: t.List[Mapping], *, progress: bool = True +) -> t.List[Mapping]: """Filter out mappings in the second set from the first set.""" skip_triples = {skip_mapping.triple for skip_mapping in skip_mappings} return [ @@ -381,7 +383,12 @@ def filter_many_to_many(mappings: t.List[Mapping], *, progress: bool = True) -> def project( - mappings: t.List[Mapping], source_prefix: str, target_prefix: str, *, return_sus: bool = False, progress: bool = False + mappings: t.List[Mapping], + source_prefix: str, + target_prefix: str, + *, + return_sus: bool = False, + progress: bool = False, ) -> t.List[Mapping] | t.Tuple[t.List[Mapping], t.List[Mapping]]: """Ensure that each identifier only appears as the subject of one mapping.""" mappings = keep_subject_prefixes(mappings, source_prefix, progress=progress) From 9a544121f3030fe9a09ceb411ab1a607638f3b20 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:18:43 +0100 Subject: [PATCH 04/23] More typing --- src/semra/struct.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/semra/struct.py b/src/semra/struct.py index a4e60c5..07f7be4 100644 --- a/src/semra/struct.py +++ b/src/semra/struct.py @@ -4,17 +4,19 @@ import math import pickle +import typing as t import uuid from collections.abc import Iterable from hashlib import md5 from itertools import islice -from typing import Annotated, ClassVar, Literal, Optional, Union +from typing import ClassVar, Literal, Optional, Union import pydantic from curies import Reference from more_itertools import triplewise from pydantic import Field from pydantic.types import UUID4 +from typing_extensions import Annotated __all__ = [ "Reference", @@ -29,10 +31,10 @@ ] #: A type annotation for a subject-predicate-object triple -Triple = tuple[Reference, Reference, Reference] +Triple = t.Tuple[Reference, Reference, Reference] -def triple_key(triple: Triple) -> tuple[str, str, str]: +def triple_key(triple: Triple) -> t.Tuple[str, str, str]: """Get a sortable key for a triple.""" return triple[0].curie, triple[2].curie, triple[1].curie @@ -148,7 +150,7 @@ def key(self): return self.evidence_type, self.justification, self.author, self.mapping_set.key(), self.uuid @property - def mapping_set_names(self) -> set[str]: + def mapping_set_names(self) -> t.Set[str]: return {self.mapping_set.name} def get_confidence(self) -> float: @@ -165,7 +167,7 @@ class Config: evidence_type: Literal["reasoned"] = Field(default="reasoned") justification: Reference = Field(..., description="A SSSOM-compliant justification") - mappings: list[Mapping] = Field( + mappings: t.List[Mapping] = Field( ..., description="A list of mappings and their evidences consumed to create this evidence" ) author: Optional[Reference] = None @@ -187,9 +189,12 @@ def mapping_set(self) -> None: return None @property - def mapping_set_names(self) -> set[str]: + def mapping_set_names(self) -> t.Set[str]: return { - name for mapping in self.mappings for evidence in mapping.evidence for name in evidence.mapping_set_names + name + for mapping in self.mappings + for evidence in mapping.evidence + for name in evidence.mapping_set_names # type:ignore } @property @@ -214,7 +219,7 @@ class Config: s: Reference = Field(..., title="subject") p: Reference = Field(..., title="predicate") o: Reference = Field(..., title="object") - evidence: list[Evidence] = Field(default_factory=list) + evidence: t.List[Evidence] = Field(default_factory=list) @property def triple(self) -> Triple: @@ -222,7 +227,7 @@ def triple(self) -> Triple: return self.s, self.p, self.o @classmethod - def from_triple(cls, triple: Triple, evidence: Optional[list[Evidence]] = None) -> Mapping: + def from_triple(cls, triple: Triple, evidence: Optional[t.List[Evidence]] = None) -> Mapping: """Instantiate a mapping from a triple.""" s, p, o = triple return cls(s=s, p=p, o=o, evidence=evidence or []) @@ -255,7 +260,7 @@ def has_tertiary(self) -> bool: return any(not isinstance(evidence, SimpleEvidence) for evidence in self.evidence) -def line(*references: Reference) -> list[Mapping]: +def line(*references: Reference) -> t.List[Mapping]: """Create a list of mappings from a simple mappings path.""" if not (3 <= len(references) and len(references) % 2): # noqa:PLR2004 raise ValueError From 405d509d6c2ac604fde634a0d68c4e04cec26f29 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:22:51 +0100 Subject: [PATCH 05/23] Add future --- src/semra/rules.py | 2 ++ src/semra/sources/chembl.py | 3 +++ src/semra/sources/clo.py | 2 ++ src/semra/sources/famplex.py | 2 ++ src/semra/sources/intact.py | 2 ++ src/semra/sources/ncit.py | 1 + src/semra/sources/pubchem.py | 2 ++ 7 files changed, 14 insertions(+) diff --git a/src/semra/rules.py b/src/semra/rules.py index 4f9785e..51bc7c3 100644 --- a/src/semra/rules.py +++ b/src/semra/rules.py @@ -1,5 +1,7 @@ """Constants and rules for inference.""" +from __future__ import annotations + from semra.struct import Reference EXACT_MATCH = Reference(prefix="skos", identifier="exactMatch") diff --git a/src/semra/sources/chembl.py b/src/semra/sources/chembl.py index f4fec39..c6228ba 100644 --- a/src/semra/sources/chembl.py +++ b/src/semra/sources/chembl.py @@ -1,4 +1,7 @@ """Get mappings from ChEMBL.""" + +from __future__ import annotations + from typing import Optional import bioregistry diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index db66ced..3b4ae8f 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -1,5 +1,7 @@ """Process mappings from CLO.""" +from __future__ import annotations + from typing import Optional import bioontologies diff --git a/src/semra/sources/famplex.py b/src/semra/sources/famplex.py index 0f8091a..e0f0155 100644 --- a/src/semra/sources/famplex.py +++ b/src/semra/sources/famplex.py @@ -1,5 +1,7 @@ """Get mappings from FamPlex.""" +from __future__ import annotations + import logging import bioregistry diff --git a/src/semra/sources/intact.py b/src/semra/sources/intact.py index d200689..4e01bf2 100644 --- a/src/semra/sources/intact.py +++ b/src/semra/sources/intact.py @@ -1,5 +1,7 @@ """Get mappings from IntAct.""" +from __future__ import annotations + import bioregistry import bioversions import pandas as pd diff --git a/src/semra/sources/ncit.py b/src/semra/sources/ncit.py index ec624a5..df364a6 100644 --- a/src/semra/sources/ncit.py +++ b/src/semra/sources/ncit.py @@ -1,4 +1,5 @@ """Get mappings from NCIT.""" + from __future__ import annotations from functools import lru_cache diff --git a/src/semra/sources/pubchem.py b/src/semra/sources/pubchem.py index 135cf8e..a5d06ca 100644 --- a/src/semra/sources/pubchem.py +++ b/src/semra/sources/pubchem.py @@ -1,5 +1,7 @@ """Get mappings from PubChem.""" +from __future__ import annotations + import logging from typing import Optional From dad08701939223ecc517c0447fa8a6beec4c323d Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:23:24 +0100 Subject: [PATCH 06/23] Update io.py --- src/semra/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semra/io.py b/src/semra/io.py index 4e1bada..1c2f141 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -367,7 +367,7 @@ def _get_name_by_curie(curie: str) -> str | None: if curie.startswith("orcid:"): import requests - orcid = curie.removeprefix("orcid:") + orcid = curie[len("orcid:") :] res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=5).json() return res["person"]["name"]["given-names"]["value"] + " " + res["person"]["name"]["family-name"]["value"] return pyobo.get_name_by_curie(curie) From f557ac93af6d0df98cbce622d7da2d4ae2ef3110 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:24:36 +0100 Subject: [PATCH 07/23] Clean --- src/semra/client.py | 2 +- src/semra/sources/clo.py | 2 +- src/semra/wsgi.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/semra/client.py b/src/semra/client.py index b029561..f5a0fe7 100644 --- a/src/semra/client.py +++ b/src/semra/client.py @@ -26,7 +26,7 @@ Node: TypeAlias = t.Mapping[str, Any] -TxResult: TypeAlias = t.Optional[list[list[Any]]] +TxResult: TypeAlias = t.Optional[t.List[t.List[Any]]] ReferenceHint: TypeAlias = t.Union[str, Reference] diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index 3b4ae8f..5f8cd93 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -34,7 +34,7 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: for node in tqdm(graph.nodes, unit_scale=True, unit="node"): if not node.id.startswith(CLO_URI_PREFIX): continue - clo_id = node.id.removeprefix(CLO_URI_PREFIX) + clo_id = node.id[len(CLO_URI_PREFIX) :] for p in node.properties or []: if p.predicate_raw != "http://www.w3.org/2000/01/rdf-schema#seeAlso": continue diff --git a/src/semra/wsgi.py b/src/semra/wsgi.py index f34a8ef..cf7200b 100644 --- a/src/semra/wsgi.py +++ b/src/semra/wsgi.py @@ -1,5 +1,7 @@ """Run the app.""" +from __future__ import annotations + import os import fastapi From 8552c1e65ee4d41268ad72ce0cc9b53e3c4c37a9 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:26:15 +0100 Subject: [PATCH 08/23] Update pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 25ac2b0..358fe4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -150,6 +150,8 @@ ignore = [ "EM102", "EM101", # Ignore pickle security warnings "S301", + # Ignore upgrading type annotations + "UP006", "UP007", "UP035", ] unfixable = [ # Don't touch unused imports From df9ca40569a2f6bdce961104a96b8022cfc3f579 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:26:50 +0100 Subject: [PATCH 09/23] Update tests.yml --- .github/workflows/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1c70cb3..3ae09ff 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -23,7 +23,10 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - pip install tox + pip install tox hatch + - name: Test linting + run: + hatch run lint:style - name: Test with mypy run: tox -e mypy From d63f49da89261229bf63cd922095721e585987b2 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:28:27 +0100 Subject: [PATCH 10/23] Update pyproject.toml --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 358fe4c..dfc64e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,6 +152,8 @@ ignore = [ "S301", # Ignore upgrading type annotations "UP006", "UP007", "UP035", + # Ignore shadowing python builtins (because we use 'license') + "A001", "A002", "A003", ] unfixable = [ # Don't touch unused imports From abf5f876ef577243691656b862316aaf2c059d37 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:29:29 +0100 Subject: [PATCH 11/23] Update wsgi.py --- src/semra/wsgi.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/semra/wsgi.py b/src/semra/wsgi.py index cf7200b..c6e855e 100644 --- a/src/semra/wsgi.py +++ b/src/semra/wsgi.py @@ -155,13 +155,13 @@ def view_mapping_set(curie: str): @api_router.get("/evidence/{curie}", response_model=Evidence) -def get_evidence(curie: str = Path(description="An evidence's MD5 hex digest.")): # noqa:B008 +def get_evidence(curie: str = Path(description="An evidence's MD5 hex digest.")): return client.get_evidence(curie) @api_router.get("/cytoscape/{curie}") def get_concept_cytoscape( - curie: str = Path(description="the compact URI (CURIE) for a concept", examples=EXAMPLE_CONCEPTS) # noqa:B008 + curie: str = Path(description="the compact URI (CURIE) for a concept", examples=EXAMPLE_CONCEPTS) ): """Get the mapping graph surrounding the concept as a Cytoscape.js JSON object.""" graph = client.get_connected_component_graph(curie) @@ -170,18 +170,13 @@ def get_concept_cytoscape( @api_router.get("/mapping/{mapping}", response_model=Mapping) -def get_mapping( - mapping: str = Path( # noqa:B008 - description="A mapping's MD5 hex digest.", - examples=EXAMPLE_MAPPINGS, - ) -): +def get_mapping(mapping: str = Path(description="A mapping's MD5 hex digest.", examples=EXAMPLE_MAPPINGS)): return client.get_mapping(mapping) @api_router.get("/mapping_set/{mapping_set}", response_model=MappingSet) def get_mapping_set( - mapping_set: str = Path( # noqa:B008 + mapping_set: str = Path( description="A mapping set's MD5 hex digest.", examples=["7831d5bc95698099fb6471667e5282cd"] ) ): From 94e67a6c9ac5a921fce0cb466b10cb2f063e5e77 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:32:43 +0100 Subject: [PATCH 12/23] More cleanup --- src/semra/sources/clo.py | 5 ++++- src/semra/wsgi.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index 5f8cd93..7613f15 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -39,7 +39,10 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: if p.predicate_raw != "http://www.w3.org/2000/01/rdf-schema#seeAlso": continue for raw_curie in _split(p.value_raw): - curie = raw_curie.removeprefix("rrid:").removeprefix("RRID:") + if raw_curie.lower().startswith("rrid:"): + curie = raw_curie[len("rrid:"): ] + else: + curie = raw_curie prefix: Optional[str] identifier: Optional[str] if curie.startswith("Sanger:COSMICID:"): diff --git a/src/semra/wsgi.py b/src/semra/wsgi.py index c6e855e..3ccbb67 100644 --- a/src/semra/wsgi.py +++ b/src/semra/wsgi.py @@ -3,6 +3,7 @@ from __future__ import annotations import os +import typing as t import fastapi import flask @@ -183,7 +184,7 @@ def get_mapping_set( return client.get_mapping_set(mapping_set) -@api_router.get("/mapping_set/", response_model=list[MappingSet]) +@api_router.get("/mapping_set/", response_model=t.List[MappingSet]) def get_mapping_sets(): return client.get_mapping_sets() From b690a736afa858cf8111578f5ffc288896fbc9cf Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:34:02 +0100 Subject: [PATCH 13/23] Update clo.py --- src/semra/sources/clo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index 7613f15..c4a775f 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -40,7 +40,7 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: continue for raw_curie in _split(p.value_raw): if raw_curie.lower().startswith("rrid:"): - curie = raw_curie[len("rrid:"): ] + curie = raw_curie[len("rrid:") :] else: curie = raw_curie prefix: Optional[str] From d5b06f62966f08b552a8c4e25b0c1290f7f64116 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:35:34 +0100 Subject: [PATCH 14/23] Update clo.py --- src/semra/sources/clo.py | 49 +++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index c4a775f..3768bb5 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -21,6 +21,12 @@ def _split(s: str) -> list[str]: return [p2.replace(" ", "").rstrip(")") for p1 in s.strip().split(";") for p2 in p1.strip().split(",")] +def _removeprefix(s, prefix): + if s.startswith(prefix): + return s[len(prefix) :] + return s + + def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: graph = bioontologies.get_obograph_by_prefix("clo", check=False).guess("clo") mapping_set = MappingSet( @@ -39,56 +45,53 @@ def get_clo_mappings(confidence: float = 0.8) -> list[Mapping]: if p.predicate_raw != "http://www.w3.org/2000/01/rdf-schema#seeAlso": continue for raw_curie in _split(p.value_raw): - if raw_curie.lower().startswith("rrid:"): - curie = raw_curie[len("rrid:") :] - else: - curie = raw_curie + curie = _removeprefix(_removeprefix(raw_curie, "rrid:"), "RRID") prefix: Optional[str] identifier: Optional[str] if curie.startswith("Sanger:COSMICID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("Sanger:COSMICID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "Sanger:COSMICID:") elif curie.startswith("atcc:COSMICID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("atcc:COSMICID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "atcc:COSMICID:") elif curie.startswith("DSMZ:COSMICID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("DSMZ:COSMICID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "DSMZ:COSMICID:") elif curie.startswith("COSMIC: COSMIC ID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("COSMIC: COSMIC ID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "COSMIC: COSMIC ID:") elif curie.startswith("RIKEN:COSMICID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("RIKEN:COSMICID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "RIKEN:COSMICID:") elif curie.startswith("COSMICID:"): - prefix, identifier = "cosmic.cell", curie.removeprefix("COSMICID:") + prefix, identifier = "cosmic.cell", _removeprefix(curie, "COSMICID:") elif curie.startswith("LINCS_HMS:"): - prefix, identifier = "hms.lincs.cell", curie.removeprefix("LINCS_HMS:") + prefix, identifier = "hms.lincs.cell", _removeprefix(curie, "LINCS_HMS:") elif curie.startswith("CHEMBL:"): - prefix, identifier = "chembl.cell", curie.removeprefix("CHEMBL:") + prefix, identifier = "chembl.cell", _removeprefix(curie, "CHEMBL:") elif curie.startswith("ChEMBL:"): - prefix, identifier = "chembl.cell", curie.removeprefix("ChEMBL:") + prefix, identifier = "chembl.cell", _removeprefix(curie, "ChEMBL:") elif curie.startswith("BTO_"): - prefix, identifier = "bto", curie.removeprefix("BTO_") + prefix, identifier = "bto", _removeprefix(curie, "BTO_") elif curie.startswith("CVCL_"): - prefix, identifier = "cellosaurus", curie.removeprefix("CVCL_") + prefix, identifier = "cellosaurus", _removeprefix(curie, "CVCL_") elif curie.startswith("JHSF:"): - prefix, identifier = "jcrb", curie.removeprefix("JHSF:") + prefix, identifier = "jcrb", _removeprefix(curie, "JHSF:") elif curie.startswith("CRL-"): prefix, identifier = "atcc", curie elif curie.startswith("jcrb:JHSF:"): - prefix, identifier = "jcrb", curie.removeprefix("jcrb:JHSF:") + prefix, identifier = "jcrb", _removeprefix(curie, "jcrb:JHSF:") elif curie.startswith("JCRB"): prefix, identifier = "jcrb", curie elif curie.startswith("JHSF:JCRB"): - prefix, identifier = "jcrb", curie.removeprefix("JHSF:") + prefix, identifier = "jcrb", _removeprefix(curie, "JHSF:") elif curie.startswith("ATCCCRL"): - prefix, identifier = "atcc", curie.removeprefix("ATCC") + prefix, identifier = "atcc", _removeprefix(curie, "ATCC") elif curie.startswith("bto:BAO_"): - prefix, identifier = "bao", curie.removeprefix("bto:BAO_") + prefix, identifier = "bao", _removeprefix(curie, "bto:BAO_") elif curie.startswith("ACC"): prefix, identifier = "dsmz", curie elif curie.startswith("DSMZACC"): - prefix, identifier = "dsmz", curie.removeprefix("DSMZ") + prefix, identifier = "dsmz", _removeprefix(curie, "DSMZ") elif curie.startswith("dsmz:ACC"): - prefix, identifier = "dsmz", "ACC-" + curie.removeprefix("dsmz:ACC") + prefix, identifier = "dsmz", "ACC-" + _removeprefix(curie, "dsmz:ACC") elif curie.startswith("DSMZ:ACC"): - prefix, identifier = "dsmz", "ACC-" + curie.removeprefix("DSMZ:ACC") + prefix, identifier = "dsmz", "ACC-" + _removeprefix(curie, "DSMZ:ACC") else: prefix, identifier = bioregistry.parse_curie(curie) From 922c80c2052437d4793b57245c058985929efb99 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:37:36 +0100 Subject: [PATCH 15/23] Up --- pyproject.toml | 3 ++- src/semra/sources/clo.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dfc64e1..b243526 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "semra" dynamic = ["version"] description = 'Semantic Mapping Reasoning Assembler' readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.8" license = "MIT" keywords = [] authors = [ @@ -16,6 +16,7 @@ authors = [ classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python", + "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", diff --git a/src/semra/sources/clo.py b/src/semra/sources/clo.py index 3768bb5..7cd41b9 100644 --- a/src/semra/sources/clo.py +++ b/src/semra/sources/clo.py @@ -21,7 +21,7 @@ def _split(s: str) -> list[str]: return [p2.replace(" ", "").rstrip(")") for p1 in s.strip().split(";") for p2 in p1.strip().split(",")] -def _removeprefix(s, prefix): +def _removeprefix(s: str, prefix: str) -> str: if s.startswith(prefix): return s[len(prefix) :] return s From e1253150aac2545b7eb601ac0ef9b0164d83d063 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 15:43:43 +0100 Subject: [PATCH 16/23] Update pyproject.toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b243526..d00d28b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "bioontologies", "pyobo", "typing_extensions", + "rdflib", # remove after https://github.com/biopragmatics/bioregistry/pull/1030 is released ] [project.optional-dependencies] From 6778c5977b7573f136b2400bd00f18f5e66d1152 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 16:12:34 +0100 Subject: [PATCH 17/23] Update test_pipeline.py --- tests/test_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index be606fe..faf7bd8 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,6 +1,7 @@ """Tests for the automated assembly pipeline.""" import tempfile +import typing as t import unittest from pathlib import Path @@ -30,7 +31,7 @@ ] -def get_test_mappings() -> list[Mapping]: +def get_test_mappings() -> t.List[Mapping]: """A test function to get mappings.""" return TEST_MAPPINGS From b87da1054c77e8f01f593174d074bd5a4af03159 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 16:32:55 +0100 Subject: [PATCH 18/23] Update test_api.py --- tests/test_api.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_api.py b/tests/test_api.py index edd1fa7..1f14c6d 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,5 +1,6 @@ from __future__ import annotations +import typing as t import unittest from semra import api @@ -24,11 +25,11 @@ from semra.struct import Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence, line, triple_key -def _get_references(n: int, prefix: str = "test") -> list[Reference]: +def _get_references(n: int, prefix: str = "test") -> t.List[Reference]: return [Reference(prefix=prefix, identifier=str(i)) for i in range(1, n + 1)] -def _exact(s, o, evidence: list[SimpleEvidence] | None = None) -> Mapping: +def _exact(s, o, evidence: t.Optional[t.List[SimpleEvidence]] = None) -> Mapping: return Mapping(s=s, p=EXACT_MATCH, o=o, evidence=evidence or []) @@ -101,8 +102,8 @@ def test_index(self): def assert_same_triples( self, - expected_mappings: Index | list[Mapping], - actual_mappings: Index | list[Mapping], + expected_mappings: t.Union[Index, t.List[Mapping]], + actual_mappings: t.Union[Index, t.List[Mapping]], msg: str | None = None, ) -> None: """Assert that two sets of mappings are the same.""" @@ -118,7 +119,7 @@ def assert_same_triples( ) @staticmethod - def _clean_index(index: Index) -> list[str]: + def _clean_index(index: Index) -> t.List[str]: triples = sorted(set(index), key=triple_key) return ["<" + ", ".join(element.curie for element in triple) + ">" for triple in triples] From c2490da9bc2490c47dd3e34f01655d05f0949868 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 16:53:01 +0100 Subject: [PATCH 19/23] Update __init__.py --- src/semra/sources/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/semra/sources/__init__.py b/src/semra/sources/__init__.py index 11ee38d..13b23ad 100644 --- a/src/semra/sources/__init__.py +++ b/src/semra/sources/__init__.py @@ -1,6 +1,7 @@ """Sources of xrefs not from OBO.""" import itertools as itt +import typing as t from collections.abc import Callable, Iterable from class_resolver import FunctionResolver @@ -43,7 +44,7 @@ "get_clo_mappings", ] -SOURCE_RESOLVER: FunctionResolver[Callable[[], list[Mapping]]] = FunctionResolver( +SOURCE_RESOLVER: FunctionResolver[Callable[[], t.List[Mapping]]] = FunctionResolver( [ get_chembl_compound_mappings, get_chembl_protein_mappings, From 840f7045571dba1456aff11e68bf4923fe6dfa4e Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 17:00:15 +0100 Subject: [PATCH 20/23] Update __init__.py --- src/semra/sources/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/semra/sources/__init__.py b/src/semra/sources/__init__.py index 13b23ad..395fd49 100644 --- a/src/semra/sources/__init__.py +++ b/src/semra/sources/__init__.py @@ -2,7 +2,6 @@ import itertools as itt import typing as t -from collections.abc import Callable, Iterable from class_resolver import FunctionResolver @@ -44,7 +43,7 @@ "get_clo_mappings", ] -SOURCE_RESOLVER: FunctionResolver[Callable[[], t.List[Mapping]]] = FunctionResolver( +SOURCE_RESOLVER: FunctionResolver[t.Callable[[], t.List[Mapping]]] = FunctionResolver( [ get_chembl_compound_mappings, get_chembl_protein_mappings, @@ -75,6 +74,6 @@ SOURCE_RESOLVER.synonyms[norm_key] = func -def get_custom() -> Iterable[Mapping]: +def get_custom() -> t.Iterable[Mapping]: """Get all custom mappings.""" return itt.chain.from_iterable(func() for func in SOURCE_RESOLVER) From d07096e741018ada1bdd7e529f38ea064326c576 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 17:16:10 +0100 Subject: [PATCH 21/23] Update pipeline.py --- src/semra/pipeline.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 9b35e80..bd27b52 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -4,6 +4,7 @@ import logging import time +import typing as t from pathlib import Path from typing import Any, Literal, Optional @@ -79,18 +80,18 @@ class Configuration(BaseModel): name: str = Field(description="The name of the mapping set configuration") description: str = Field(description="An explanation of the purpose of the mapping set configuration") - inputs: list[Input] - negative_inputs: list[Input] = Field(default=[Input(source="biomappings", prefix="negative")]) - priority: list[str] = Field(..., description="If no priority is given, is inferred from the order of inputs") - mutations: list[Mutation] = Field(default_factory=list) + inputs: t.List[Input] + negative_inputs: t.List[Input] = Field(default=[Input(source="biomappings", prefix="negative")]) + priority: t.List[str] = Field(..., description="If no priority is given, is inferred from the order of inputs") + mutations: t.List[Mutation] = Field(default_factory=list) - exclude_pairs: list[tuple[str, str]] = Field( + exclude_pairs: t.List[tuple[str, str]] = Field( default_factory=list, description="A list of pairs of prefixes. Remove all mappings whose source " "prefix is the first in a pair and target prefix is second in a pair. Order matters.", ) - remove_prefixes: Optional[list[str]] = None - keep_prefixes: Optional[list[str]] = None + remove_prefixes: Optional[t.List[str]] = None + keep_prefixes: Optional[t.List[str]] = None remove_imprecise: bool = True validate_raw: bool = Field( default=False, @@ -128,7 +129,7 @@ def get_mappings_from_config( *, refresh_raw: bool = False, refresh_processed: bool = False, -) -> list[Mapping]: +) -> t.List[Mapping]: """Run assembly based on a configuration.""" if ( configuration.processed_pickle_path @@ -204,7 +205,7 @@ def _get_equivalence_classes(mappings, prioritized_mappings) -> dict[Reference, return rv -def get_raw_mappings(configuration: Configuration) -> list[Mapping]: +def get_raw_mappings(configuration: Configuration) -> t.List[Mapping]: """Get raw mappings based on the inputs in a configuration.""" mappings = [] for inp in tqdm(configuration.inputs, desc="Loading configured mappings", unit="source"): @@ -243,13 +244,13 @@ def get_raw_mappings(configuration: Configuration) -> list[Mapping]: def process( - mappings: list[Mapping], + mappings: t.List[Mapping], upgrade_prefixes=None, remove_prefix_set=None, keep_prefix_set=None, *, remove_imprecise: bool = True, -) -> list[Mapping]: +) -> t.List[Mapping]: """Run a full deduplication, reasoning, and inference pipeline over a set of mappings.""" from semra.sources.biopragmatics import from_biomappings_negative @@ -329,7 +330,7 @@ def process( return mappings -def _log_diff(before: int, mappings: list[Mapping], *, verb: str, elapsed) -> None: +def _log_diff(before: int, mappings: t.List[Mapping], *, verb: str, elapsed) -> None: logger.info( f"{verb} from {before:,} to {len(mappings):,} mappings (Δ={len(mappings) - before:,}) in %.2f seconds.", elapsed, From b2000b32e40268ac1ed10930bd72679e49ed0203 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 17:22:36 +0100 Subject: [PATCH 22/23] Update pipeline.py --- src/semra/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index bd27b52..891fe66 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -63,7 +63,7 @@ class Input(BaseModel): source: Literal["pyobo", "bioontologies", "biomappings", "custom", "sssom", "gilda"] prefix: Optional[str] = None confidence: float = 1.0 - extras: dict[str, Any] = Field(default_factory=dict) + extras: t.Dict[str, Any] = Field(default_factory=dict) class Mutation(BaseModel): From 2d0fa9f6369bced2ef99d9e774144173e4cff30c Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Mon, 22 Jan 2024 17:24:31 +0100 Subject: [PATCH 23/23] Update pipeline.py --- src/semra/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 891fe66..bb3911d 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -85,7 +85,7 @@ class Configuration(BaseModel): priority: t.List[str] = Field(..., description="If no priority is given, is inferred from the order of inputs") mutations: t.List[Mutation] = Field(default_factory=list) - exclude_pairs: t.List[tuple[str, str]] = Field( + exclude_pairs: t.List[t.Tuple[str, str]] = Field( default_factory=list, description="A list of pairs of prefixes. Remove all mappings whose source " "prefix is the first in a pair and target prefix is second in a pair. Order matters.",