Skip to content

Commit

Permalink
Refactor NCIT-specific code
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Dec 5, 2023
1 parent b3aaad5 commit 81a9fb4
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 37 deletions.
34 changes: 3 additions & 31 deletions src/semra/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import itertools as itt
import logging
from collections import Counter, defaultdict
from collections.abc import Callable, Iterable
from collections.abc import Iterable
from typing import cast

import networkx as nx
Expand Down Expand Up @@ -194,7 +194,8 @@ def infer_chains(
return [*mappings, *new_mappings]


def index_str(index: Index) -> str:
def tabulate_index(index: Index) -> str:
"""Tabulate"""
from tabulate import tabulate

rows: list[tuple[str, str, str, str]] = []
Expand Down Expand Up @@ -487,35 +488,6 @@ def validate_mappings(mappings: list[Mapping]) -> None:
raise ValueError(f"banana in mapping object: {mapping}")


# Build one Mapping per row of ``df``: subject in ``source_prefix``, object in
# ``target_prefix``, predicate EXACT_MATCH, and a single evidence produced by
# calling the zero-argument ``evidence`` factory for that row.
def df_to_mappings(
df,
*,
source_prefix: str,
target_prefix: str,
evidence: Callable[[], Evidence],
source_identifier_column: str | None = None,
target_identifier_column: str | None = None,
) -> list[Mapping]:
# When no explicit column name is given, the prefix doubles as the column name.
if source_identifier_column is None:
source_identifier_column = source_prefix
if target_identifier_column is None:
target_identifier_column = target_prefix
return [
Mapping(
s=Reference(prefix=source_prefix, identifier=source_id),
p=EXACT_MATCH,
o=Reference(prefix=target_prefix, identifier=target_id),
evidence=[evidence()],
)
# Iterate the two selected columns row by row, wrapped in a tqdm progress bar.
for source_id, target_id in tqdm(
df[[source_identifier_column, target_identifier_column]].values,
unit="mapping",
unit_scale=True,
desc=f"Processing {source_prefix}",
)
]


def summarize_prefixes(mappings: list[Mapping]) -> pd.DataFrame:
"""Get a dataframe summarizing the prefixes appearing in the mappings."""
import bioregistry
Expand Down
44 changes: 38 additions & 6 deletions src/semra/sources/ncit.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
"""Get mappings from NCIT."""
from __future__ import annotations

from functools import lru_cache
from typing import Callable

import bioregistry
import pandas as pd
import requests
from curies import Reference
from tqdm.asyncio import tqdm

from semra import UNSPECIFIED_MAPPING, Mapping, MappingSet, SimpleEvidence
from semra.api import df_to_mappings
from semra import EXACT_MATCH, UNSPECIFIED_MAPPING, Evidence, Mapping, MappingSet, SimpleEvidence

__all__ = [
"get_ncit_hgnc_mappings",
Expand Down Expand Up @@ -50,7 +53,7 @@ def _get_evidence() -> SimpleEvidence:
def get_ncit_hgnc_mappings() -> list[Mapping]:
df = pd.read_csv(HGNC_MAPPINGS_URL, sep="\t", header=None, names=["ncit", "hgnc"])
df["hgnc"] = df["hgnc"].map(lambda s: s.removeprefix("HGNC:")) # type:ignore
return df_to_mappings(
return _df_to_mappings(
df,
source_prefix="ncit",
target_prefix="hgnc",
Expand All @@ -61,7 +64,7 @@ def get_ncit_hgnc_mappings() -> list[Mapping]:
def get_ncit_go_mappings() -> list[Mapping]:
df = pd.read_csv(HGNC_MAPPINGS_URL, sep="\t", header=None, names=["go", "ncit"])
df["go"] = df["go"].map(lambda s: s.removeprefix("GO:")) # type:ignore
return df_to_mappings(
return _df_to_mappings(
df,
source_prefix="ncit",
target_prefix="go",
Expand All @@ -72,7 +75,7 @@ def get_ncit_go_mappings() -> list[Mapping]:
def get_ncit_chebi_mappings() -> list[Mapping]:
df = pd.read_csv(HGNC_MAPPINGS_URL, sep="\t", header=None, names=["ncit", "chebi"])
df["chebi"] = df["chebi"].map(lambda s: s.removeprefix("CHEBI:")) # type:ignore
return df_to_mappings(
return _df_to_mappings(
df,
source_prefix="ncit",
target_prefix="chebi",
Expand All @@ -82,14 +85,43 @@ def get_ncit_chebi_mappings() -> list[Mapping]:

# Read the tab-separated, header-less table at SWISSPROT_MAPPINGS_URL into
# columns "ncit" and "uniprot", then convert each row to an ncit -> uniprot mapping.
def get_ncit_uniprot_mappings() -> list[Mapping]:
df = pd.read_csv(SWISSPROT_MAPPINGS_URL, sep="\t", header=None, names=["ncit", "uniprot"])
# NOTE(review): the two `return` lines below are presumably the removed and added
# sides of this diff hunk (the +/- markers are not shown) - confirm against the commit.
return df_to_mappings(
return _df_to_mappings(
df,
source_prefix="ncit",
target_prefix="uniprot",
evidence=_get_evidence,
)


def _df_to_mappings(
    df,
    *,
    source_prefix: str,
    target_prefix: str,
    evidence: Callable[[], Evidence],
    source_identifier_column: str | None = None,
    target_identifier_column: str | None = None,
) -> list[Mapping]:
    """Turn a two-column identifier table into exact-match mappings.

    :param df: Table that supports ``df[[col_a, col_b]].values``
        (presumably a pandas DataFrame, as produced by the callers in this module).
    :param source_prefix: Prefix used for the subject reference of every mapping.
    :param target_prefix: Prefix used for the object reference of every mapping.
    :param evidence: Zero-argument factory, called once per row, whose result
        becomes the only evidence of that row's mapping.
    :param source_identifier_column: Column holding subject identifiers;
        defaults to ``source_prefix``.
    :param target_identifier_column: Column holding object identifiers;
        defaults to ``target_prefix``.
    :returns: One mapping per row, in row order.
    """
    # The prefix doubles as the column name unless the caller overrides it.
    source_column = source_prefix if source_identifier_column is None else source_identifier_column
    target_column = target_prefix if target_identifier_column is None else target_identifier_column

    id_pairs = tqdm(
        df[[source_column, target_column]].values,
        unit="mapping",
        unit_scale=True,
        desc=f"Processing {source_prefix}",
    )

    mappings: list[Mapping] = []
    for subject_id, object_id in id_pairs:
        subject = Reference(prefix=source_prefix, identifier=subject_id)
        obj = Reference(prefix=target_prefix, identifier=object_id)
        mappings.append(
            Mapping(s=subject, p=EXACT_MATCH, o=obj, evidence=[evidence()]),
        )
    return mappings


if __name__ == "__main__":
from semra.api import print_source_target_counts

Expand Down

0 comments on commit 81a9fb4

Please sign in to comment.