Skip to content

Commit

Permalink
Update pubchem-mesh source
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Apr 24, 2024
1 parent bb94e94 commit 93d0e84
Showing 1 changed file with 24 additions and 26 deletions.
50 changes: 24 additions & 26 deletions src/semra/sources/pubchem.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@
from __future__ import annotations

import logging
from typing import Optional
from typing import Optional, Set

import bioversions
import pandas as pd
import pyobo
import requests
from curies import Reference

from semra.rules import EXACT_MATCH, UNSPECIFIED_MAPPING
Expand All @@ -25,29 +25,26 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]:
"""Get a mapping from PubChem compound identifiers to their equivalent MeSH terms."""
if version is None:
version = bioversions.get_version("pubchem")
url = f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH"
df = pd.read_csv(
url,
dtype=str,
header=None,
names=["pubchem", "mesh"],
)

mesh_name_to_id = pyobo.get_name_id_mapping("mesh")
needs_curation = set()
mesh_ids = []
for name in df["mesh"]:
mesh_id = mesh_name_to_id.get(name)
if mesh_id is None and name not in needs_curation:
needs_curation.add(name)
logger.debug("[mesh] needs curating: %s", name)
mesh_ids.append(mesh_id)
logger.info("[mesh] %d/%d need updating", len(needs_curation), len(mesh_ids))
df["mesh"] = mesh_ids

return [
Mapping(
needs_curation: Set[str] = set()

url = f"https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH"
res = requests.get(url, stream=True)

rv = []
for line in res.iter_lines():
# on a small number of entries, there are multiple names. their impact is negligible
pubchem, mesh_name, *_ = line.decode("utf8").strip().split("\t")
mesh_id = mesh_name_to_id.get(mesh_name)
if mesh_id is None:
if mesh_name not in needs_curation:
needs_curation.add(mesh_name)
logger.debug("[mesh] needs curating: %s", mesh_name)
continue
mapping = Mapping(
s=Reference(prefix="pubchem.compound", identifier=pubchem),
o=Reference(prefix="mesh", identifier=mesh),
o=Reference(prefix="mesh", identifier=mesh_id),
p=EXACT_MATCH,
evidence=[
SimpleEvidence(
Expand All @@ -57,6 +54,7 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]:
)
],
)
for pubchem, mesh in df.values
if mesh is not None
]
rv.append(mapping)

logger.warning("[pubchem-mesh] %d MeSH names need manual curation", len(needs_curation))
return rv

0 comments on commit 93d0e84

Please sign in to comment.