Skip to content

Commit

Permalink
Improvements to full mapping database build
Browse files Browse the repository at this point in the history
1. Add zenodo uploads
2. Write configuration to output directory
3. Demonstrate automated upload on protein complex landscape
4. Add automated upload to full database build
  • Loading branch information
cthoyt committed Apr 24, 2024
1 parent cc62889 commit bb94e94
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 22 deletions.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ install_requires =
bioontologies
pyobo
typing_extensions
zenodo_client

# Random options
zip_safe = false
Expand Down
82 changes: 62 additions & 20 deletions src/semra/database.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""Assemble a database."""

import pickle
import csv
import time
import typing as t

import bioregistry
import click
Expand All @@ -11,21 +12,31 @@
from bioontologies.robot import write_getter_warnings
from tqdm.auto import tqdm
from tqdm.contrib.logging import logging_redirect_tqdm

from semra.io import from_bioontologies, from_pyobo, write_neo4j, write_pickle, write_sssom
from zenodo_client import Creator, Metadata, ensure_zenodo

from semra import Mapping
from semra.io import (
from_bioontologies,
from_pickle,
from_pyobo,
write_neo4j,
write_pickle,
write_sssom,
)
from semra.rules import CHARLIE_NAME, CHARLIE_ORCID
from semra.sources import SOURCE_RESOLVER

MODULE = pystow.module("semra", "database")
SOURCES = MODULE.module("sources")
DATABASE_PATH = MODULE.join(name="sssom.tsv")
WARNINGS_PATH = MODULE.join("logs", name="warnings.tsv")
ERRORS_PATH = MODULE.join("logs", name="errors.tsv")
SUMMARY_PATH = MODULE.join("logs", name="summary.tsv")
EMPTY_PATH = MODULE.join("logs", name="empty.txt")
LOGS = MODULE.module("logs")
SSSOM_PATH = MODULE.join(name="sssom.tsv")
WARNINGS_PATH = LOGS.join(name="warnings.tsv")
ERRORS_PATH = LOGS.join(name="errors.tsv")
SUMMARY_PATH = LOGS.join(name="summary.tsv")
EMPTY_PATH = LOGS.join(name="empty.txt")
NEO4J_DIR = MODULE.join("neo4j")

EMPTY = []

summaries = []


Expand All @@ -45,6 +56,8 @@ def main():
"edam.format",
"edam.operation",
"edam.topic",
"gwascentral.phenotype", # added on 2024-04-24, service down
"gwascentral.study", # added on 2024-04-24, service down
}
#: A set of prefixes whose obo files need to be parsed without ROBOT checks
loose = {
Expand Down Expand Up @@ -79,7 +92,7 @@ def main():
continue
_write_source(resource_mappings, resource.prefix)
mappings.extend(resource_mappings)
summaries.append((resource.prefix, len(resource_mappings), time.time() - start))
summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "pyobo"))
_write_summary()

it = tqdm(list(SOURCE_RESOLVER), unit="source", desc="Custom sources")
Expand All @@ -91,15 +104,15 @@ def main():
resource_mappings = func()
_write_source(resource_mappings, resource_name)
mappings.extend(resource_mappings)
summaries.append((resource_name, len(resource_mappings), time.time() - start))
summaries.append((resource_name, len(resource_mappings), time.time() - start, "custom"))
_write_summary()

it = tqdm(ontology_resources, unit="ontology", desc="Ontology sources")
for resource in it:
it.set_postfix(prefix=resource.prefix)
path = SOURCES.join(name=f"{resource.prefix}.pkl")
if path.is_file():
resource_mappings = pickle.loads(path.read_bytes())
resource_mappings = from_pickle(path)
else:
start = time.time()
try:
Expand All @@ -112,18 +125,43 @@ def main():
# this outputs on each iteration to get faster insight
write_warned(WARNINGS_PATH)
write_getter_warnings(ERRORS_PATH)
summaries.append((resource.prefix, len(resource_mappings), time.time() - start))
summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "bioontologies"))
_write_summary()

mappings.extend(resource_mappings)

click.echo(f"Writing SSSOM to {DATABASE_PATH}")
write_sssom(mappings, DATABASE_PATH)
click.echo(f"Writing Neo4j folder to {DATABASE_PATH}")
click.echo(f"Writing SSSOM to {SSSOM_PATH}")
write_sssom(mappings, SSSOM_PATH)
click.echo(f"Writing Neo4j folder to {SSSOM_PATH}")
write_neo4j(mappings, NEO4J_DIR)


def _write_source(mappings, key):
# Define the metadata that will be used on initial upload
zenodo_metadata = Metadata(
title="SeMRA Mapping Database",
upload_type="dataset",
description=f"A compendium of mappings extracted from {len(summaries)} database/ontologies. "
f"Note that primary mappings are marked with the license of their source (when available). "
f"Inferred mappings are distributed under the CC0 license.",
creators=[
Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier),
],
)
res = ensure_zenodo(
key="semra-database-test-1",
data=zenodo_metadata,
paths=[
SSSOM_PATH,
WARNINGS_PATH,
ERRORS_PATH,
SUMMARY_PATH,
*NEO4J_DIR.iterdir(),
],
sandbox=True,
)
click.echo(res.json()["links"]["html"])


def _write_source(mappings: t.List[Mapping], key: str) -> None:
write_pickle(mappings, SOURCES.join(name=f"{key}.pkl"))
if mappings:
write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv"))
Expand All @@ -132,8 +170,12 @@ def _write_source(mappings, key):
EMPTY_PATH.write_text("\n".join(EMPTY))


def _write_summary():
SUMMARY_PATH.write_text("\n".join(f"{p}\t{n:,}\t{round(delta, 3)}" for p, n, delta in summaries))
def _write_summary() -> None:
    """Write the accumulated per-resource build summaries to ``SUMMARY_PATH`` as TSV.

    Each row records the resource prefix, the number of mappings extracted,
    the elapsed extraction time in seconds (rounded to 3 decimal places),
    and the source type (``pyobo``, ``custom``, or ``bioontologies``).

    This is called repeatedly during the build so partial progress is
    persisted after each resource finishes.
    """
    # newline="" is required by the csv module: the writer emits its own
    # line terminators, and without it "\r\r\n" appears on Windows.
    with SUMMARY_PATH.open("w", newline="") as file:
        writer = csv.writer(file, delimiter="\t")
        writer.writerow(("prefix", "mappings", "time", "source_type"))
        for prefix, n_mappings, time_delta, source_type in summaries:
            # Write the raw integer count (no thousands separators like
            # "1,234") so the column stays machine-parseable as a number.
            writer.writerow((prefix, n_mappings, round(time_delta, 3), source_type))


if __name__ == "__main__":
Expand Down
33 changes: 31 additions & 2 deletions src/semra/landscape/complexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import click
import pystow
from zenodo_client import Creator, Metadata, ensure_zenodo

from semra.pipeline import Configuration, Input, Mutation
from semra.rules import CHARLIE_NAME, CHARLIE_ORCID

__all__ = [
"MODULE",
Expand Down Expand Up @@ -51,20 +53,47 @@
],
raw_pickle_path=MODULE.join(name="raw.pkl"),
raw_sssom_path=MODULE.join(name="raw.sssom.tsv"),
# raw_neo4j_path=MODULE.join("neo4j_raw"),
raw_neo4j_path=MODULE.join("neo4j_raw"),
raw_neo4j_name="semra-complex",
processed_pickle_path=MODULE.join(name="processed.pkl"),
processed_sssom_path=MODULE.join(name="processed.sssom.tsv"),
processed_neo4j_path=MODULE.join("neo4j"),
processed_neo4j_name="semra-complex",
priority_pickle_path=MODULE.join(name="priority.pkl"),
priority_sssom_path=MODULE.join(name="priority.sssom.tsv"),
configuration_path=MODULE.join(name="configuration.json"),
)


# Metadata used when first creating the Zenodo deposition for this database.
# NOTE(review): the description reuses CONFIGURATION.description — confirm it
# reads well as a standalone public dataset description on Zenodo.
ZENODO_METADATA = Metadata(
    title="SeMRA Protein Complex Mapping Database",
    upload_type="dataset",
    description=CONFIGURATION.description,
    creators=[
        Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier),
    ],
)


@click.command()
def main():
    """Build the mapping database for protein complex terms and upload it to Zenodo.

    Regenerates both the raw and processed mappings from scratch; to reuse
    cached artifacts instead, call ``CONFIGURATION.get_mappings`` with
    ``refresh_raw=False, refresh_processed=False``. Afterwards, uploads the
    SSSOM exports, the serialized configuration, and the raw Neo4j folder
    contents to the Zenodo sandbox and echoes the resulting record's URL.
    """
    CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True)

    # NOTE(review): sandbox=True targets the Zenodo sandbox instance —
    # switch to False (or drop the flag) for a production deposition.
    res = ensure_zenodo(
        key="semra-complex",
        data=ZENODO_METADATA,
        paths=[
            CONFIGURATION.raw_sssom_path,
            CONFIGURATION.configuration_path,
            CONFIGURATION.processed_sssom_path,
            CONFIGURATION.priority_sssom_path,
            *CONFIGURATION.raw_neo4j_path.iterdir(),
        ],
        sandbox=True,
    )
    click.echo(res.json()["links"]["html"])


if __name__ == "__main__":
Expand Down
7 changes: 7 additions & 0 deletions src/semra/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ class Configuration(BaseModel):

add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?")

configuration_path: Optional[Path] = Field(None, description="The path where this configuration should be written.")

@root_validator(skip_on_failure=True)
def infer_priority(cls, values): # noqa:N805
"""Infer the priority from the input list of not given."""
Expand Down Expand Up @@ -214,6 +216,11 @@ def get_mappings_from_config(
"loaded cached raw mappings from %s in %.2f seconds", configuration.raw_pickle_path, time.time() - start
)
else:
if configuration.configuration_path is not None:
configuration.configuration_path.write_text(
configuration.model_dump_json(exclude_none=True, exclude_unset=True, indent=2)
)

raw_mappings = get_raw_mappings(configuration)
if configuration.validate_raw:
validate_mappings(raw_mappings)
Expand Down
1 change: 1 addition & 0 deletions src/semra/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@
KNOWLEDGE_MAPPING = Reference.from_curie("semapv:BackgroundKnowledgeBasedMatching")

# ORCID identifier and display name for Charles Tapley Hoyt, referenced
# together where an author credit needs both an iD and a human-readable name.
CHARLIE_ORCID = Reference.from_curie("orcid:0000-0003-4423-4370")
CHARLIE_NAME = "Charles Tapley Hoyt"
BEN_ORCID = Reference.from_curie("orcid:0000-0001-9439-5346")

0 comments on commit bb94e94

Please sign in to comment.