diff --git a/README.md b/README.md index ea942ef..decefdf 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,9 @@ This software provides: 4. A confidence model granular at the curator-level, mapping set-level, and community feedback-level +We also provide an accompanying raw semantic mapping database on Zenodo at +https://zenodo.org/records/11082039. + ## 🚀 Installation The most recent release can be installed from diff --git a/notebooks/landscape/README.md b/notebooks/landscape/README.md index 7b18a4e..10f9ea0 100644 --- a/notebooks/landscape/README.md +++ b/notebooks/landscape/README.md @@ -32,10 +32,10 @@ resource, how many show up in all resources, and how many show up in a few A summary chart over all landscapes can be generated with `landscape.py`. -| name | raw_term_count | unique_term_count | reduction | -|---------|---------------:|------------------:|----------:| -| disease | 410173 | 243730 | 0.405787 | -| anatomy | 37917 | 32108 | 0.153203 | -| complex | 15869 | 7775 | 0.510051 | -| gene | 4.94578e+07 | 4.87886e+07 | 0.013529 | -| cell | 207019 | 166274 | 0.196818 | \ No newline at end of file +| name | raw_term_count | unique_term_count | reduction | download | +|---------|---------------:|------------------:|----------:|------------------------------------------------------------------------:| +| disease | 410,173 | 243,730 | 0.405787 | [zenodo.record:11091886](https://bioregistry.io/zenodo.record:11091886) | +| anatomy | 37,917 | 32,108 | 0.153203 | [zenodo.record:11091803](https://bioregistry.io/zenodo.record:11091803) | +| complex | 15,869 | 7,775 | 0.510051 | [zenodo.record:11091422](https://bioregistry.io/zenodo.record:11091422) | +| gene | 49,457,767 | 48,788,600 | 0.013529 | [zenodo.record:11092013](https://bioregistry.io/zenodo.record:11092013) | +| cell | 207,019 | 166,274 | 0.196818 | [zenodo.record:11091581](https://bioregistry.io/zenodo.record:11091581) | diff --git a/notebooks/landscape/anatomy/configuration.json b/notebooks/landscape/anatomy/configuration.json new file mode 100644 index 0000000..f002bcd --- /dev/null +++ b/notebooks/landscape/anatomy/configuration.json @@ -0,0 +1,129 @@ +{ + "name": "SeMRA Anatomy Mappings Database", + "description": "Supports the analysis of the landscape of anatomy nomenclature resources.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "pyobo", + "prefix": "uberon", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "bto", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "caro", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mesh", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "ncit", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "uberon", + "mesh", + "bto", + "caro", + "ncit", + "umls" + ], + "mutations": [ + { + "source": "uberon", + "confidence": 0.8 + }, + { + "source": "bto", + "confidence": 0.65 + }, + { + "source": "caro", + "confidence": 0.8 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D001829", + "mesh:D009141", + "mesh:D004064", + "mesh:D012137", + "mesh:D014566", + "mesh:D004703", + "mesh:D002319", + "mesh:D009420", + "mesh:D012679", + "mesh:D014024", + "mesh:D005441", + "mesh:D000825", + "mesh:D013284", + "mesh:D006424", + "mesh:D004628", +
"mesh:D034582", + "mesh:D018514", + "mesh:D056229", + "mesh:D056226", + "mesh:D056224" + ], + "ncit": [ + "ncit:C12219" + ], + "umls": [ + "sty:T024", + "sty:T017" + ] + }, + "keep_prefixes": [ + "uberon", + "mesh", + "bto", + "caro", + "ncit", + "umls" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/neo4j", + "processed_neo4j_name": "semra-anatomy", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.sssom.tsv", + "add_labels": false, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/configuration.json", + "zenodo_record": 11091803 +} \ No newline at end of file diff --git a/notebooks/landscape/cell/configuration.json b/notebooks/landscape/cell/configuration.json new file mode 100644 index 0000000..2e719b4 --- /dev/null +++ b/notebooks/landscape/cell/configuration.json @@ -0,0 +1,159 @@ +{ + "name": "SeMRA Cell and Cell Line Mappings Database", + "description": "Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario posed in the Biomappings paper, this configuration imports several different cell and cell line resources and identifies mappings between them.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "pyobo", + "prefix": "cellosaurus", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "bto", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "cl", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "clo", + "confidence": 0.65 + }, + { + "source": "pyobo", + "prefix": "efo", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "depmap", + "confidence": 0.99, + "extras": { + "version": "22Q4", + "standardize": true, + "license": "CC-BY-4.0" + } + }, + { + "source": "pyobo", + "prefix": "ccle", + "confidence": 0.99, + "extras": { + "version": "2019" + } + }, + { + "source": "pyobo", + "prefix": "ncit", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "mesh", + "efo", + "cellosaurus", + "ccle", + "depmap", + "bto", + "cl", + "clo", + "ncit", + "umls" + ], + "mutations": [ + { + "source": "efo", + "confidence": 0.7 + }, + { + "source": "bto", + "confidence": 0.7 + }, + { + "source": "cl", + "confidence": 0.7 + }, + { + "source": "clo", + "confidence": 0.7 + }, + { + "source": "depmap", + "confidence": 0.7 + }, + { + "source": "ccle", + "confidence": 0.7 + }, + { + "source": "cellosaurus", + "confidence": 0.7 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D002477" + ], + "efo": [ + "efo:0000324" + ], + "ncit": [ + "ncit:C12508" + ], + "umls": [ + "sty:T025" + ] + }, + "keep_prefixes": [ + "mesh", + "efo", + "cellosaurus", + "ccle", + "depmap", + "bto", + "cl", + "clo", + "ncit", + "umls" + ], + "remove_imprecise": false, + "raw_pickle_path": 
"/Users/cthoyt/.data/semra/case-studies/cells/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/cells/neo4j", + "processed_neo4j_name": "semra-cell", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/cells/configuration.json", + "zenodo_record": 11091581 +} \ No newline at end of file diff --git a/notebooks/landscape/complex/configuration.json b/notebooks/landscape/complex/configuration.json new file mode 100644 index 0000000..607e877 --- /dev/null +++ b/notebooks/landscape/complex/configuration.json @@ -0,0 +1,98 @@ +{ + "name": "SeMRA Protein Complex Landscape Analysis", + "description": "Analyze the landscape of protein complex nomenclature resources, species-agnostic.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "gilda" + }, + { + "source": "biomappings" + }, + { + "source": "pyobo", + "prefix": "fplx", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "fplx", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "intact_complexportal", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "complexportal", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "go", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "complexportal", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "reactome", + "confidence": 0.99 + } + ], + "priority": [ + "complexportal", + "fplx", + "go", + "chembl.target", + "wikidata", + "scomp", + "signor", + "intact" + ], + "mutations": [ + { + "source": "go", + "confidence": 0.95 + } + ], + "subsets": { + "go": [ + "go:0032991" + ] + }, + "post_keep_prefixes": [ + "complexportal", + "fplx", + "go", + "chembl.target", + "wikidata", + "scomp", + "signor", + "intact" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/raw.sssom.tsv", + "raw_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/complex/neo4j_raw", + "raw_neo4j_name": "semra-complex", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/complex/neo4j", + "processed_neo4j_name": "semra-complex", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/complex/configuration.json", + "zenodo_record": 11091422 +} \ No newline at end of file diff --git a/notebooks/landscape/disease/configuration.json b/notebooks/landscape/disease/configuration.json new file mode 100644 index 0000000..f796fe5 --- /dev/null +++ b/notebooks/landscape/disease/configuration.json @@ -0,0 +1,157 @@ +{ + "name": "SeMRA Disease Mappings Database", + 
"description": "Supports the analysis of the landscape of disease nomenclature resources.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "bioontologies", + "prefix": "doid", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "mondo", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "efo", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mesh", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "ncit", + "confidence": 0.85 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.9 + }, + { + "source": "bioontologies", + "prefix": "orphanet.ordo", + "confidence": 0.9 + } + ], + "priority": [ + "doid", + "mondo", + "efo", + "mesh", + "ncit", + "orphanet", + "orphanet.ordo", + "umls", + "omim", + "omim.ps", + "gard", + "icd10", + "icd10cm", + "icd10pcs", + "icd11", + "icd9", + "icd9cm", + "icdo" + ], + "mutations": [ + { + "source": "doid", + "confidence": 0.95 + }, + { + "source": "mondo", + "confidence": 0.95 + }, + { + "source": "efo", + "confidence": 0.9 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + }, + { + "source": "orphanet.ordo", + "confidence": 0.7 + }, + { + "source": "orphanet", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D007239", + "mesh:D001520", + "mesh:D011579", + "mesh:D001523", + "mesh:D004191" + ], + "efo": [ + "efo:0000408" + ], + "ncit": [ + "ncit:C2991" + ], + "umls": [ + "sty:T049", + "sty:T047", + "sty:T191", + "sty:T050", + "sty:T048" + ] + }, + "keep_prefixes": [ + "doid", + "mondo", + "efo", + "mesh", + "ncit", + "orphanet", + "orphanet.ordo", + "umls", + "omim", + "omim.ps", + "gard", + "icd10", + "icd10cm", + "icd10pcs", + "icd11", + "icd9", + "icd9cm", + "icdo" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/disease/neo4j", + "processed_neo4j_name": "semra-disease", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/disease/configuration.json", + "zenodo_record": 11091886 +} \ No newline at end of file diff --git a/notebooks/landscape/gene/configuration.json b/notebooks/landscape/gene/configuration.json new file mode 100644 index 0000000..c76639c --- /dev/null +++ b/notebooks/landscape/gene/configuration.json @@ -0,0 +1,132 @@ +{ + "name": "SeMRA Gene Mapping Database", + "description": "Analyze the landscape of gene nomenclature resources, species-agnostic.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "pyobo", + "prefix": "hgnc", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mgi", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "rgd", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "cgnc", + "confidence": 0.99 + }, + { + 
"source": "pyobo", + "prefix": "sgd", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "civic.gid", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "flybase", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "ncit_hgnc", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "omim_gene", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "ncbigene", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "civic.gid", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "ensembl", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "hgnc", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "omim", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "ncbigene", + "hgnc", + "mgi", + "rgd", + "cgnc", + "wormbase", + "flybase", + "sgd", + "omim", + "civic.gid", + "umls", + "ncit", + "wikidata" + ], + "mutations": [ + { + "source": "umls", + "confidence": 0.8 + }, + { + "source": "ncit", + "confidence": 0.8 + } + ], + "subsets": { + "umls": [ + "umls:C0017337" + ], + "ncit": [ + "ncit:C16612" + ] + }, + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/raw.pkl.gz", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/raw.sssom.tsv.gz", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/processed.pkl.gz", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/processed.sssom.tsv.gz", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/gene/neo4j", + "processed_neo4j_name": "semra-gene", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/priority.pkl.gz", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/priority.sssom.tsv.gz", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/gene/configuration.json", + "zenodo_record": 11092013 +} \ No newline at end of file diff --git a/notebooks/landscape/landscape.py b/notebooks/landscape/landscape.py index 1283810..9caccc5 100644 --- a/notebooks/landscape/landscape.py +++ b/notebooks/landscape/landscape.py @@ -5,6 +5,8 @@ import pandas as pd +from semra import Configuration + HERE = Path(__file__).parent.resolve() @@ -14,15 +16,23 @@ def main() -> None: for directory in HERE.iterdir(): if not directory.is_dir(): continue - path = directory.joinpath("stats.json") - if not path.is_file(): + + row = {"name": directory.name} + + statistics_path = directory.joinpath("stats.json") + if not statistics_path.is_file(): continue - row = json.loads(path.read_text()) - row["name"] = directory.name + row.update(json.loads(statistics_path.read_text())) + + configuration_path = directory.joinpath("configuration.json") + configuration = Configuration.parse_file(configuration_path) + row["zenodo"] = configuration.zenodo_url() rows.append(row) df = pd.DataFrame(rows).set_index("name") - df = df[["raw_term_count", "unique_term_count", "reduction"]] - print(df.to_markdown(tablefmt="github")) + df = df[["raw_term_count", "unique_term_count", "reduction", "zenodo"]] + df["reduction"] = df["reduction"].map(lambda r: f"{r:.1%}") + df = df.astype(str) + print(df.to_latex(label="landscape-summary-table", caption="")) if __name__ == "__main__": diff --git a/setup.cfg b/setup.cfg index 98b319e..c0d66a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ install_requires = bioontologies pyobo typing_extensions + 
zenodo_client # Random options zip_safe = false diff --git a/src/semra/database.py b/src/semra/database.py index d684f64..6b0e3c3 100644 --- a/src/semra/database.py +++ b/src/semra/database.py @@ -1,58 +1,75 @@ """Assemble a database.""" -import pickle +import csv import time +import typing as t import bioregistry import click import pyobo import pystow +import requests from bioontologies.obograph import write_warned from bioontologies.robot import write_getter_warnings from tqdm.auto import tqdm from tqdm.contrib.logging import logging_redirect_tqdm - -from semra.io import from_bioontologies, from_pyobo, write_neo4j, write_pickle, write_sssom +from zenodo_client import Creator, Metadata, ensure_zenodo + +from semra import Mapping +from semra.io import ( + from_bioontologies, + from_pickle, + from_pyobo, + write_neo4j, + write_pickle, + write_sssom, +) +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID from semra.sources import SOURCE_RESOLVER +from semra.sources.wikidata import get_wikidata_mappings_by_prefix MODULE = pystow.module("semra", "database") SOURCES = MODULE.module("sources") -DATABASE_PATH = MODULE.join(name="sssom.tsv") -WARNINGS_PATH = MODULE.join("logs", name="warnings.tsv") -ERRORS_PATH = MODULE.join("logs", name="errors.tsv") -SUMMARY_PATH = MODULE.join("logs", name="summary.tsv") -EMPTY_PATH = MODULE.join("logs", name="empty.txt") +LOGS = MODULE.module("logs") +SSSOM_PATH = MODULE.join(name="mappings.sssom.tsv.gz") +PICKLE_PATH = MODULE.join(name="mappings.pkl.gz") +WARNINGS_PATH = LOGS.join(name="warnings.tsv") +ERRORS_PATH = LOGS.join(name="errors.tsv") +SUMMARY_PATH = LOGS.join(name="summary.tsv") +EMPTY_PATH = LOGS.join(name="empty.txt") NEO4J_DIR = MODULE.join("neo4j") EMPTY = [] - summaries = [] +skip = { + "ado", # trash + "epio", # trash + "chebi", # too big + "pr", # too big + "ncbitaxon", # too big + "ncit", # too big + "ncbigene", # too big + # duplicates of EDAM + "edam.data", + "edam.format", + "edam.operation", + "edam.topic", + "gwascentral.phenotype", # added on 2024-04-24, service down + "gwascentral.study", # added on 2024-04-24, service down +} +#: A set of prefixes whose obo files need to be parsed without ROBOT checks +loose = { + "caloha", + "foodon", + "cellosaurus", +} + @click.command() -def main(): +@click.option("--include-wikidata", is_flag=True) +def main(include_wikidata: bool): """Construct the full SeMRA database.""" - skip = { - "ado", # trash - "epio", # trash - "chebi", # too big - "pr", # too big - "ncbitaxon", # too big - "ncit", # too big - "ncbigene", # too big - # duplicates of EDAM - "edam.data", - "edam.format", - "edam.operation", - "edam.topic", - } - #: A set of prefixes whose obo files need to be parsed without ROBOT checks - loose = { - "caloha", - "foodon", - "cellosaurus", - } - ontology_resources = [] pyobo_resources = [] for resource in bioregistry.resources(): @@ -79,27 +96,53 @@ def main(): continue _write_source(resource_mappings, resource.prefix) mappings.extend(resource_mappings) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "pyobo")) _write_summary() it = tqdm(list(SOURCE_RESOLVER), unit="source", desc="Custom sources") for func in it: start = time.time() resource_name = func.__name__.removeprefix("get_").removesuffix("_mappings") + if resource_name == "wikidata": + # this source needs extra information, so it is handled separately below + continue it.set_postfix(source=resource_name) with logging_redirect_tqdm(): resource_mappings = func()
_write_source(resource_mappings, resource_name) mappings.extend(resource_mappings) - summaries.append((resource_name, len(resource_mappings), time.time() - start)) + summaries.append((resource_name, len(resource_mappings), time.time() - start, "custom")) _write_summary() + skip_wikidata_prefixes = { + "pubmed", # too big! need paging? + "doi", # too big! need paging? + "inchi", # too many funny characters + "smiles", # too many funny characters + } + if include_wikidata: + it = tqdm(bioregistry.get_registry_map("wikidata"), unit="property", desc="Wikidata") + for prefix in it: + it.set_postfix(prefix=prefix) + if prefix in skip_wikidata_prefixes: + continue + start = time.time() + resource_name = f"wikidata_{prefix}" + try: + resource_mappings = get_wikidata_mappings_by_prefix(prefix) + except requests.exceptions.JSONDecodeError as e: + tqdm.write(f"[{resource_name}] failed to get mappings from wikidata: {e}") + continue + _write_source(resource_mappings, resource_name) + mappings.extend(resource_mappings) + summaries.append((resource_name, len(resource_mappings), time.time() - start, "wikidata")) + _write_summary() + it = tqdm(ontology_resources, unit="ontology", desc="Ontology sources") for resource in it: it.set_postfix(prefix=resource.prefix) - path = SOURCES.join(name=f"{resource.prefix}.pkl") + path = SOURCES.join(name=f"{resource.prefix}.pkl.gz") if path.is_file(): - resource_mappings = pickle.loads(path.read_bytes()) + resource_mappings = from_pickle(path) else: start = time.time() try: @@ -112,28 +155,59 @@ def main(): # this outputs on each iteration to get faster insight write_warned(WARNINGS_PATH) write_getter_warnings(ERRORS_PATH) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "bioontologies")) _write_summary() mappings.extend(resource_mappings) - click.echo(f"Writing SSSOM to {DATABASE_PATH}") - write_sssom(mappings, DATABASE_PATH) - click.echo(f"Writing Neo4j folder to {DATABASE_PATH}") + click.echo(f"Writing SSSOM to {SSSOM_PATH}") + write_sssom(mappings, SSSOM_PATH) + click.echo(f"Writing Pickle to {PICKLE_PATH}") + write_pickle(mappings, PICKLE_PATH) + click.echo(f"Writing Neo4j folder to {NEO4J_DIR}") write_neo4j(mappings, NEO4J_DIR) - -def _write_source(mappings, key): - write_pickle(mappings, SOURCES.join(name=f"{key}.pkl")) + # Define the metadata that will be used on initial upload + zenodo_metadata = Metadata( + title="SeMRA Mapping Database", + upload_type="dataset", + description=f"A compendium of mappings extracted from {len(summaries)} databases and ontologies. " + f"Note that primary mappings are marked with the license of their source (when available). 
" + f"Inferred mappings are distributed under the CC0 license.", + creators=[ + Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier), + ], + ) + res = ensure_zenodo( + key="semra-database-test-1", + data=zenodo_metadata, + paths=[ + SSSOM_PATH, + WARNINGS_PATH, + ERRORS_PATH, + SUMMARY_PATH, + *NEO4J_DIR.iterdir(), + ], + sandbox=True, + ) + click.echo(res.json()["links"]["html"]) + + +def _write_source(mappings: t.List[Mapping], key: str) -> None: if mappings: - write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv")) + write_pickle(mappings, SOURCES.join(name=f"{key}.pkl.gz")) + write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv"), add_labels=True) else: EMPTY.append(key) EMPTY_PATH.write_text("\n".join(EMPTY)) -def _write_summary(): - SUMMARY_PATH.write_text("\n".join(f"{p}\t{n:,}\t{round(delta, 3)}" for p, n, delta in summaries)) +def _write_summary() -> None: + with SUMMARY_PATH.open("w") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerow(("prefix", "mappings", "seconds", "source_type")) + for prefix, n_mappings, time_delta, source_type in summaries: + writer.writerow((prefix, n_mappings, round(time_delta, 2), source_type)) if __name__ == "__main__": diff --git a/src/semra/io.py b/src/semra/io.py index a00cc71..3b5209b 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -2,6 +2,7 @@ from __future__ import annotations +import csv import gzip import logging import pickle @@ -21,6 +22,7 @@ import requests from bioregistry import Collection from tqdm.autonotebook import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm from semra.rules import DB_XREF, UNSPECIFIED_MAPPING from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence @@ -50,6 +52,11 @@ #: node to the mapping node(s) from which it was derived DERIVED_PREDICATE = "derivedFromMapping" +HAS_AUTHOR_PREDICATE = "hasAuthor" + +#: The default confidence for ontology-based mappings +DEFAULT_ONTOLOGY_CONFIDENCE = 0.9 + def _safe_get_version(prefix: str) -> str | None: """Get a version from Bioversions, or return None if not possible.""" @@ -99,7 +106,7 @@ def _from_pyobo_pair( :param source_prefix: The prefix of the ontology :param target_prefix: The prefix of the target :param predicate: The predicate of the mappings. Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE`. :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -151,7 +158,7 @@ def from_cache_df( :param source_prefix: The prefix of the ontology :param prefixes: A set of prefixes to subset the second column of cross-reference targets :param predicate: The predicate of the mappings. Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE` :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -205,7 +212,7 @@ def _from_pyobo_df( :param source_prefix: The prefix of the ontology :param prefixes: A set of prefixes to subset the second column of cross-reference targets :param predicate: The predicate of the mappings. 
Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE` :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -226,7 +233,7 @@ def _from_pyobo_df( if justification is None: justification = UNSPECIFIED_MAPPING if confidence is None: - confidence = 0.9 + confidence = DEFAULT_ONTOLOGY_CONFIDENCE if license is None: license = bioregistry.get_license(source_prefix) if isinstance(prefixes, str): @@ -283,8 +290,10 @@ def from_pyobo( return _from_pyobo_prefix(prefix, standardize=standardize, **kwargs) -def from_bioontologies(prefix: str, confidence=None, **kwargs) -> list[Mapping]: +def from_bioontologies(prefix: str, confidence: float | None = None, **kwargs) -> list[Mapping]: """Get mappings from a given ontology via :mod:`bioontologies`.""" + if confidence is None: + confidence = DEFAULT_ONTOLOGY_CONFIDENCE o = bioontologies.get_obograph_by_prefix(prefix, **kwargs) g = o.guess(prefix) # note that we don't extract stuff from edges so just node standardization is good enough @@ -350,6 +359,8 @@ def _parse_sssom_row( author = None if "mapping_set_name" in row and pd.notna(row["mapping_set_name"]): n = row["mapping_set_name"] + elif "mapping_set" in row and pd.notna(row["mapping_set"]): + n = row["mapping_set"] elif mapping_set_name is None: raise KeyError("need a mapping set name") else: @@ -417,8 +428,9 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat ] df = pd.DataFrame(rows, columns=columns) if add_labels: - for label_column, id_column in [("subject_label", "subject_id"), ("object_label", "object_id")]: - df[label_column] = df[id_column].map(_get_name_by_curie) # type:ignore + with logging_redirect_tqdm(): + for label_column, id_column in [("subject_label", "subject_id"), ("object_label", "object_id")]: + df[label_column] = df[id_column].map(_get_name_by_curie) # type:ignore df = df[ [ "subject_id", @@ -469,8 +481,11 @@ def get_orcid_name(orcid: str) -> Optional[str]: if orcid.startswith("orcid:"): orcid = orcid[len("orcid:") :] - res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=5).json() - name = res["person"]["name"] + try: + res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=5).json() + except IOError: # e.g., ReadTimeout + return None + name = res.get("person", {}).get("name") if name is None: return None if credit_name := name.get("credit-name"): @@ -543,7 +558,7 @@ def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001 return "true" if b else "false" # type:ignore -def _safe_confidence(x) -> str: +def _safe_confidence(x: Evidence) -> str: confidence = x.get_confidence() if confidence is None: return "" @@ -559,6 +574,7 @@ def write_neo4j( add_labels: bool = False, startup_script_name: str = "startup.sh", run_script_name: str = "run_on_docker.sh", + sort: bool = False, ) -> None: """Write all files needed to construct a Neo4j graph database from a set of mappings. @@ -582,6 +598,7 @@ def write_neo4j( :param startup_script_name: The name of the startup script that the Dockerfile calls :param run_script_name: The name of the run script that you as the user should call to wrap building and running the Docker image + :param sort: Should the output nodes files be sorted? 
:raises NotADirectoryError: If the directory given does not already exist. It's suggested to use :mod:`pystow` to create deterministic directories. @@ -612,16 +629,15 @@ def write_neo4j( run_path = directory.joinpath(run_script_name) docker_path = directory.joinpath("Dockerfile") - concept_nodes_path = directory.joinpath("concept_nodes.tsv") + concept_nodes_path = directory.joinpath("concept_nodes.tsv.gz") concepts: set[Reference] = set() - concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "priority:boolean"] + concept_nodes_header = ["curie:ID", "prefix", "name", "priority:boolean"] if equivalence_classes is None: equivalence_classes = {} - mapping_nodes_path = directory.joinpath("mapping_nodes.tsv") + mapping_nodes_path = directory.joinpath("mapping_nodes.tsv.gz") mapping_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "predicate", "confidence", @@ -630,22 +646,20 @@ def write_neo4j( "tertiary:boolean", ] - evidence_nodes_path = directory.joinpath("evidence_nodes.tsv") + evidence_nodes_path = directory.joinpath("evidence_nodes.tsv.gz") evidences = {} evidence_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "type", "mapping_justification", "confidence:float", ] - mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv") + mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv.gz") mapping_sets = {} mapping_set_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "name", "license", @@ -653,8 +667,7 @@ def write_neo4j( "confidence:float", ] - edges_path = directory.joinpath("edges.tsv") - edges: list[tuple[str, str, str, str | float, str, str, str, str]] = [] + mapping_edges_path = directory.joinpath("mapping_edges.tsv.gz") edges_header = [ ":START_ID", ":TYPE", @@ -665,65 +678,80 @@ def write_neo4j( "tertiary:boolean", "mapping_sets:string[]", ] - - for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): - concepts.add(mapping.s) - concepts.add(mapping.o) - - edges.append( - ( - mapping.s.curie, - mapping.p.curie, - mapping.o.curie, - _safe_confidence(mapping), - _neo4j_bool(mapping.has_primary), - _neo4j_bool(mapping.has_secondary), - _neo4j_bool(mapping.has_tertiary), - "|".join(sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set})), + edges_path = directory.joinpath("edges.tsv.gz") + edges_supp_header = [ + ":START_ID", + ":TYPE", + ":END_ID", + ] + with gzip.open(mapping_edges_path, "wt") as file1, gzip.open(edges_path, "wt") as file2: + mapping_writer = csv.writer(file1, delimiter="\t") + mapping_writer.writerow(edges_header) + + edge_writer = csv.writer(file2, delimiter="\t") + edge_writer.writerow(edges_supp_header) + + for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): + concepts.add(mapping.s) + concepts.add(mapping.o) + + mapping_writer.writerow( + ( + mapping.s.curie, + mapping.p.curie, + mapping.o.curie, + _safe_confidence(mapping), + _neo4j_bool(mapping.has_primary), + _neo4j_bool(mapping.has_secondary), + _neo4j_bool(mapping.has_tertiary), + "|".join( + sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set}) + ), + ) ) - ) - edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "", "", "", "", "")) - edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "", "", "", "", "")) - for evidence in mapping.evidence: - edges.append((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie, "", "", "", "", "")) - evidences[evidence.key()] = evidence - if 
evidence.mapping_set: - mapping_sets[evidence.mapping_set.name] = evidence.mapping_set - edges.append((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie, "", "", "", "", "")) - elif isinstance(evidence, ReasonedEvidence): - for mmm in evidence.mappings: - edges.append((evidence.curie, DERIVED_PREDICATE, mmm.curie, "", "", "", "", "")) - # elif isinstance(evidence, SimpleEvidence): - # pass - # else: - # raise TypeError - - # Add authorship information for the evidence, if available - if evidence.author: - concepts.add(evidence.author) - edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "", "", "", "", "")) - - _write_tsv( + edge_writer.writerow((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie)) + edge_writer.writerow((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie)) + for evidence in mapping.evidence: + edge_writer.writerow((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie)) + evidences[evidence.key()] = evidence + if evidence.mapping_set: + mapping_sets[evidence.mapping_set.name] = evidence.mapping_set + edge_writer.writerow((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie)) + elif isinstance(evidence, ReasonedEvidence): + for mmm in evidence.mappings: + edge_writer.writerow((evidence.curie, DERIVED_PREDICATE, mmm.curie)) + # elif isinstance(evidence, SimpleEvidence): + # pass + # else: + # raise TypeError + + # Add authorship information for the evidence, if available + if evidence.author: + concepts.add(evidence.author) + edge_writer.writerow((evidence.curie, HAS_AUTHOR_PREDICATE, evidence.author.curie)) + + sorted_concepts = sorted(concepts, key=lambda n: n.curie) if sort else list(concepts) + _write_tsv_gz( concept_nodes_path, concept_nodes_header, ( ( concept.curie, - "concept", concept.prefix, _get_name_by_curie(concept.curie) or "" if add_labels else "", _neo4j_bool(equivalence_classes.get(concept, False)), ) - for concept in sorted(concepts, key=lambda n: n.curie) + for concept in tqdm(sorted_concepts, desc="writing concept nodes", unit_scale=True, unit="concept") ), ) - _write_tsv( + + sorted_mappings = sorted(mappings, key=lambda n: n.curie) if sort else mappings + _write_tsv_gz( mapping_nodes_path, mapping_nodes_header, ( ( mapping.curie, - "mapping", "semra.mapping", mapping.p.curie, _safe_confidence(mapping), @@ -731,41 +759,44 @@ def write_neo4j( _neo4j_bool(mapping.has_secondary), _neo4j_bool(mapping.has_tertiary), ) - for mapping in sorted(mappings, key=lambda n: n.curie) + for mapping in tqdm(sorted_mappings, desc="writing mapping nodes", unit_scale=True, unit="mapping") ), ) - _write_tsv( + + sorted_mapping_sets = sorted(mapping_sets.values(), key=lambda n: n.curie) if sort else list(mapping_sets.values()) + _write_tsv_gz( mapping_set_nodes_path, mapping_set_nodes_header, ( ( mapping_set.curie, - "mappingset", "semra.mappingset", mapping_set.name, mapping_set.license or "", mapping_set.version or "", _safe_confidence(mapping_set), ) - for mapping_set in sorted(mapping_sets.values(), key=lambda n: n.curie) + for mapping_set in sorted_mapping_sets ), ) - _write_tsv( + + sorted_evidences = sorted(evidences.values(), key=lambda row: row.curie) if sort else list(evidences.values()) + _write_tsv_gz( evidence_nodes_path, evidence_nodes_header, ( ( evidence.curie, - "evidence", "semra.evidence", evidence.evidence_type, evidence.justification.curie, _safe_confidence(evidence), ) - for evidence in sorted(evidences.values(), key=lambda row: row.curie) + for evidence in tqdm( + sorted_evidences, desc="Writing evidence 
nodes", leave=False, unit_scale=True, unit="evidence" + ) ), ) - _write_tsv(edges_path, edges_header, sorted(set(edges), key=_edge_key)) startup_commands = dedent( """\ @@ -773,9 +804,7 @@ def write_neo4j( neo4j start # Get the port - until [ \ - "$(curl -s -w '%{http_code}' -o /dev/null "http://localhost:7474")" \ - -eq 200 ] + until [ "$(curl -s -w '%{http_code}' -o /dev/null "http://localhost:7474")" -eq 200 ] do sleep 5 done @@ -805,26 +834,33 @@ def write_neo4j( apt-get install -y git zip unzip bzip2 gcc pkg-config python3.11 && \\ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - ARG twiddle1=dee RUN python3.11 -m pip install "semra[web] @ git+https://github.com/biopragmatics/semra.git" # Add graph content - ARG twiddle2=dee - COPY concept_nodes.tsv /sw/concept_nodes.tsv - COPY mapping_nodes.tsv /sw/mapping_nodes.tsv - COPY evidence_nodes.tsv /sw/evidence_nodes.tsv - COPY mapping_set_nodes.tsv /sw/mapping_set_nodes.tsv - COPY edges.tsv /sw/edges.tsv + COPY concept_nodes.tsv.gz /sw/concept_nodes.tsv.gz + COPY mapping_nodes.tsv.gz /sw/mapping_nodes.tsv.gz + COPY evidence_nodes.tsv.gz /sw/evidence_nodes.tsv.gz + COPY mapping_set_nodes.tsv.gz /sw/mapping_set_nodes.tsv.gz + COPY mapping_edges.tsv.gz /sw/mapping_edges.tsv.gz + COPY edges.tsv.gz /sw/edges.tsv.gz # Ingest graph content into neo4j RUN sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \\ sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \\ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true \\ + --relationships /sw/mapping_edges.tsv \\ --relationships /sw/edges.tsv \\ - --nodes /sw/concept_nodes.tsv \\ - --nodes /sw/mapping_nodes.tsv \\ - --nodes /sw/mapping_set_nodes.tsv \\ - --nodes /sw/evidence_nodes.tsv + --nodes=concept=/sw/concept_nodes.tsv \\ + --nodes=mapping=/sw/mapping_nodes.tsv \\ + --nodes=mappingset=/sw/mapping_set_nodes.tsv \\ + --nodes=evidence=/sw/evidence_nodes.tsv + + RUN rm /sw/concept_nodes.tsv.gz + RUN rm /sw/mapping_nodes.tsv.gz + RUN rm /sw/evidence_nodes.tsv.gz + RUN rm /sw/mapping_set_nodes.tsv.gz + RUN rm /sw/edges.tsv.gz + RUN rm /sw/mapping_edges.tsv.gz COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] @@ -860,9 +896,9 @@ def write_neo4j( # command_path.write_text(shell_command) -def _write_tsv(path, header, rows) -> None: +def _write_tsv_gz(path, header, rows) -> None: click.echo(f"writing to {path}") - with path.open("w") as file: - print(*header, sep="\t", file=file) # noqa:T201 - for row in rows: - print(*row, sep="\t", file=file) # noqa:T201 + with gzip.open(path, "wt") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerow(header) + writer.writerows(rows) diff --git a/src/semra/landscape/anatomy.py b/src/semra/landscape/anatomy.py index 47f889f..5cdbe53 100644 --- a/src/semra/landscape/anatomy.py +++ b/src/semra/landscape/anatomy.py @@ -5,6 +5,7 @@ from pyobo.sources.mesh import get_mesh_category_curies import semra +from semra.pipeline import CREATOR_CHARLIE __all__ = [ "MODULE", @@ -32,7 +33,9 @@ } CONFIGURATION = semra.Configuration( - name="Anatomy mappings", + name="SeMRA Anatomy Mappings Database", + description="Supports the analysis of the landscape of anatomy nomenclature resources.", + creators=[CREATOR_CHARLIE], inputs=[ semra.Input(source="biomappings"), semra.Input(source="gilda"), @@ -64,6 +67,8 @@ processed_neo4j_name="semra-anatomy", priority_pickle_path=MODULE.join(name="priority.pkl"), 
priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091803, ) @@ -71,6 +76,7 @@ def main(): """Build the mapping database for anatomical terms.""" CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/cells.py b/src/semra/landscape/cells.py index d0fef7f..8c48738 100644 --- a/src/semra/landscape/cells.py +++ b/src/semra/landscape/cells.py @@ -20,7 +20,7 @@ from semra.api import project, str_source_target_counts from semra.io import write_sssom -from semra.pipeline import Configuration, Input, Mutation, get_mappings_from_config +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation, get_mappings_from_config __all__ = [ "MODULE", @@ -50,9 +50,10 @@ } CONFIGURATION = Configuration( - name="Cell and Cell Line Mappings", + name="SeMRA Cell and Cell Line Mappings Database", description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario posed in the Biomappings paper, " "this configuration imports several different cell and cell line resources and identifies mappings between them.", + creators=[CREATOR_CHARLIE], inputs=[ Input(source="biomappings"), Input(source="gilda"), @@ -96,6 +97,8 @@ processed_neo4j_name="semra-cell", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091581, ) @@ -103,6 +106,7 @@ def main(): """Build the mapping database for cell and cell line terms.""" mappings = get_mappings_from_config(CONFIGURATION, refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() click.echo(f"Processing returned {len(mappings):,} mappings") click.echo(str_source_target_counts(mappings)) diff --git a/src/semra/landscape/complexes.py b/src/semra/landscape/complexes.py index 2724b52..0977ddd 100644 --- a/src/semra/landscape/complexes.py +++ b/src/semra/landscape/complexes.py @@ -3,7 +3,8 @@ import click import pystow -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import Configuration, Creator, Input, Mutation +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID __all__ = [ "MODULE", @@ -27,8 +28,9 @@ } CONFIGURATION = Configuration( - name="Protein Complex Landscape Analysis", + name="SeMRA Protein Complex Landscape Analysis", description="Analyze the landscape of protein complex nomenclature resources, species-agnostic.", + creators=[Creator(orcid=CHARLIE_ORCID.identifier, name=CHARLIE_NAME)], inputs=[ Input(source="gilda"), Input(source="biomappings"), @@ -51,20 +53,25 @@ ], raw_pickle_path=MODULE.join(name="raw.pkl"), raw_sssom_path=MODULE.join(name="raw.sssom.tsv"), - # raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_name="semra-complex", processed_pickle_path=MODULE.join(name="processed.pkl"), processed_sssom_path=MODULE.join(name="processed.sssom.tsv"), processed_neo4j_path=MODULE.join("neo4j"), processed_neo4j_name="semra-complex", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091422, ) @click.command() def main(): """Build the mapping database for protein complex terms.""" - CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + 
CONFIGURATION.get_mappings(refresh_raw=False, refresh_processed=False) + res = CONFIGURATION.upload_zenodo() + click.echo(res.json()["links"]["html"]) if __name__ == "__main__": diff --git a/src/semra/landscape/diseases.py b/src/semra/landscape/diseases.py index 33ae4d5..642cd1d 100644 --- a/src/semra/landscape/diseases.py +++ b/src/semra/landscape/diseases.py @@ -5,7 +5,7 @@ import pystow from pyobo.sources.mesh import get_mesh_category_curies -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation __all__ = [ "MODULE", @@ -45,8 +45,9 @@ } CONFIGURATION = Configuration( - name="Disease Landscape Analysis", - description="", + name="SeMRA Disease Mappings Database", + description="Supports the analysis of the landscape of disease nomenclature resources.", + creators=[CREATOR_CHARLIE], inputs=[ Input(source="biomappings"), Input(source="gilda"), @@ -84,6 +85,8 @@ processed_neo4j_name="semra-disease", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091886, ) @@ -92,6 +95,7 @@ def main(): """Build the mapping database for disease terms.""" # Takes about 2 hours CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/genes.py b/src/semra/landscape/genes.py index 6e75cc8..0b4e3f0 100644 --- a/src/semra/landscape/genes.py +++ b/src/semra/landscape/genes.py @@ -3,7 +3,7 @@ import click import pystow -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation __all__ = [ "MODULE", @@ -30,8 +30,9 @@ ] CONFIGURATION = Configuration( - name="Gene Landscape Analysis", + name="SeMRA Gene Mapping Database", description="Analyze the landscape of gene nomenclature resources, species-agnostic.", + creators=[CREATOR_CHARLIE], inputs=[ Input(prefix="hgnc", source="pyobo", confidence=0.99), Input(prefix="mgi", source="pyobo", confidence=0.99), @@ -58,15 +59,17 @@ Mutation(source="umls", confidence=0.8), Mutation(source="ncit", confidence=0.8), ], - raw_pickle_path=MODULE.join(name="raw.pkl"), - raw_sssom_path=MODULE.join(name="raw.sssom.tsv"), + raw_pickle_path=MODULE.join(name="raw.pkl.gz"), + raw_sssom_path=MODULE.join(name="raw.sssom.tsv.gz"), # raw_neo4j_path=MODULE.join("neo4j_raw"), - processed_pickle_path=MODULE.join(name="processed.pkl"), - processed_sssom_path=MODULE.join(name="processed.sssom.tsv"), + processed_pickle_path=MODULE.join(name="processed.pkl.gz"), + processed_sssom_path=MODULE.join(name="processed.sssom.tsv.gz"), processed_neo4j_path=MODULE.join("neo4j"), processed_neo4j_name="semra-gene", - priority_pickle_path=MODULE.join(name="priority.pkl"), - priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + priority_pickle_path=MODULE.join(name="priority.pkl.gz"), + priority_sssom_path=MODULE.join(name="priority.sssom.tsv.gz"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11092013, ) @@ -74,6 +77,7 @@ def main(): """Build the mapping database for gene terms.""" CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/utils.py b/src/semra/landscape/utils.py index 4127d2f..9889a39 100644 --- a/src/semra/landscape/utils.py +++ b/src/semra/landscape/utils.py 
@@ -68,6 +68,9 @@ def notebook( if output_directory is None: output_directory = configuration.raw_pickle_path.parent output_directory = Path(output_directory).expanduser().resolve() + configuration_path = output_directory.joinpath("configuration.json") + configuration_path.write_text(configuration.model_dump_json(indent=2, exclude_none=True, exclude_unset=True)) + terms = get_terms(configuration.priority, configuration.subsets) hydrated_subsets = configuration.get_hydrated_subsets() diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 7c9514d..3990e9c 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Any, Literal, Optional +import requests from pydantic import BaseModel, Field, root_validator from tqdm.auto import tqdm @@ -35,7 +36,7 @@ write_pickle, write_sssom, ) -from semra.rules import DB_XREF, EXACT_MATCH, IMPRECISE +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID, DB_XREF, EXACT_MATCH, IMPRECISE from semra.sources import SOURCE_RESOLVER from semra.sources.biopragmatics import ( from_biomappings_negative, @@ -46,9 +47,13 @@ from semra.sources.wikidata import get_wikidata_mappings_by_prefix from semra.struct import Mapping, Reference +if t.TYPE_CHECKING: + import zenodo_client + __all__ = [ # Configuration model "Configuration", + "Creator", "SubsetConfiguration", "Input", "Mutation", @@ -82,6 +87,16 @@ class Mutation(BaseModel): SubsetConfiguration = t.Mapping[str, t.Collection[str]] +class Creator(BaseModel): + """A model describing a creator.""" + + name: str + orcid: str + + +CREATOR_CHARLIE = Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier) + + class Configuration(BaseModel): """Represents the steps taken during mapping assembly.""" @@ -89,6 +104,7 @@ class Configuration(BaseModel): description: Optional[str] = Field( None, description="An explanation of the purpose of the mapping set configuration" ) + creators: t.List[Creator] = Field(default_factory=list, description="A list of the ORCID identifiers for creators") inputs: t.List[Input] = Field(..., description="A list of sources of mappings") negative_inputs: t.List[Input] = Field(default=[Input(source="biomappings", prefix="negative")]) priority: t.List[str] = Field( @@ -97,9 +113,9 @@ class Configuration(BaseModel): mutations: t.List[Mutation] = Field(default_factory=list) subsets: t.Optional[t.Mapping[str, t.List[str]]] = Field( None, - description="A field to put restrictions on the subhierarchies from each resource. For example, if " + description="A field to put restrictions on the sub-hierarchies from each resource. For example, if " "you want to assemble cell mappings from MeSH, you don't need all possible mesh mappings, but only " - "ones that have to do with terms in the cell hierchy under the mesh:D002477 term. Therefore, this " + "ones that have to do with terms in the cell hierarchy under the mesh:D002477 term. 
Therefore, this " "dictionary allows for specifying such restrictions", examples=[ {"mesh": ["mesh:D002477"]}, @@ -138,6 +154,10 @@ class Configuration(BaseModel): add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?") + configuration_path: Optional[Path] = Field(None, description="The path where this configuration should be written.") + + zenodo_record: Optional[int] = Field(None, description="The Zenodo record identifier") + @root_validator(skip_on_failure=True) def infer_priority(cls, values): # noqa:N805 """Infer the priority from the input list of not given.""" @@ -146,6 +166,12 @@ def infer_priority(cls, values): # noqa:N805 values["priority"] = [inp.prefix for inp in values["inputs"].inputs if inp.prefix is not None] return values + def zenodo_url(self) -> t.Optional[str]: + """Get the zenodo URL, if available.""" + if self.zenodo_record is None: + return None + return f"https://bioregistry.io/zenodo.record:{self.zenodo_record}" + @classmethod def from_prefixes( cls, *, name: str, prefixes: t.Iterable[str], include_biomappings: bool = True, include_gilda: bool = True @@ -189,6 +215,80 @@ def get_hydrated_subsets(self) -> t.Mapping[str, t.Collection[str]]: return {} return hydrate_subsets(self.subsets) + def _get_zenodo_metadata(self) -> "zenodo_client.Metadata": + if not self.creators: + raise ValueError("Creating a Zenodo record requires annotating the creators field") + import zenodo_client + + if self.name is None: + raise ValueError("name must be given to upload to zenodo") + if self.description is None: + raise ValueError("description must be given to upload to zenodo") + if not self.creators: + raise ValueError("at least one creator must be given to upload to zenodo") + + return zenodo_client.Metadata( + upload_type="dataset", + title=self.name, + description=self.description, + creators=[zenodo_client.Creator(name=creator.name, orcid=creator.orcid) for creator in self.creators], + ) + + def _get_zenodo_paths(self, *, processed: bool = True) -> t.List[Path]: + if self.configuration_path is not None and not self.configuration_path.is_file(): + self.configuration_path.write_text(self.model_dump_json(indent=2, exclude_none=True, exclude_unset=True)) + paths = [ + self.configuration_path, + self.raw_sssom_path, + self.raw_pickle_path, + self.processed_sssom_path, + self.processed_pickle_path, + self.priority_sssom_path, + self.processed_pickle_path, + ] + for path in paths: + if path is None: + raise ValueError("Can't upload to Zenodo if not all output paths are configured") + if not path.is_file(): + raise FileNotFoundError(path) + if processed and self.processed_neo4j_path is not None and self.processed_neo4j_path.is_dir(): + paths.extend(self.processed_neo4j_path.iterdir()) + elif self.raw_neo4j_path is not None and self.raw_neo4j_path.is_dir(): + paths.extend(self.raw_neo4j_path.iterdir()) + else: + logger.debug("Not uploading neo4j") + return t.cast(t.List[Path], paths) + + def ensure_zenodo( + self, key: str, *, metadata: t.Optional["zenodo_client.Metadata"] = None, processed: bool = True, **kwargs + ) -> requests.Response: + """Ensure a zenodo record.""" + if self.zenodo_record is not None: + raise ValueError( + f"Refusing to create new Zenodo record since it already exists: " + f"https://bioregistry.io/zenodo.record:{self.zenodo_record}.\n\n" + f"Use `Configuration.upload_zenodo(processed={processed})` instead." 
+ ) + + from zenodo_client import ensure_zenodo + + paths = self._get_zenodo_paths(processed=processed) + res = ensure_zenodo(key=key, data=metadata or self._get_zenodo_metadata(), paths=paths, **kwargs) + return res + + def upload_zenodo(self, processed: bool = True, **kwargs) -> requests.Response: + """Upload a Zenodo record.""" + if not self.zenodo_record: + raise ValueError( + "Can not upload to zenodo if no record is configured.\n\n" + f"Use `Configuration.ensure_zenodo(key=..., processed={processed})` instead." + ) + from zenodo_client import update_zenodo + + paths = self._get_zenodo_paths(processed=processed) + res = update_zenodo(str(self.zenodo_record), paths=paths, **kwargs) + return res + def get_mappings_from_config( configuration: Configuration, @@ -214,6 +314,11 @@ def get_mappings_from_config( "loaded cached raw mappings from %s in %.2f seconds", configuration.raw_pickle_path, time.time() - start ) else: + if configuration.configuration_path is not None: + configuration.configuration_path.write_text( + configuration.model_dump_json(exclude_none=True, exclude_unset=True, indent=2) + ) + raw_mappings = get_raw_mappings(configuration) if configuration.validate_raw: validate_mappings(raw_mappings) @@ -374,7 +479,7 @@ def process( mappings = infer_mutual_dbxref_mutations(mappings, upgrade_prefixes, confidence=0.95) _log_diff(before, mappings, verb="Inferred upgrades", elapsed=time.time() - start) - # remove dbxrefs + # remove database cross-references if remove_imprecise: logger.info("Removing unqualified database xrefs") before = len(mappings) @@ -382,7 +487,7 @@ def process( mappings = [m for m in mappings if m.p not in IMPRECISE] _log_diff(before, mappings, verb="Filtered non-precise", elapsed=time.time() - start) - # 3. Inference based on adding reverse relations then doing multi-chain hopping + # 3. 
Inference based on adding reverse relations then doing multichain hopping logger.info("Inferring reverse mappings") before = len(mappings) start = time.time() diff --git a/src/semra/rules.py b/src/semra/rules.py index 1d44c18..49d94fe 100644 --- a/src/semra/rules.py +++ b/src/semra/rules.py @@ -2,7 +2,7 @@ from __future__ import annotations -from semra.struct import Reference +from curies import Reference EXACT_MATCH = Reference(prefix="skos", identifier="exactMatch") BROAD_MATCH = Reference(prefix="skos", identifier="broadMatch") @@ -44,4 +44,5 @@ KNOWLEDGE_MAPPING = Reference.from_curie("semapv:BackgroundKnowledgeBasedMatching") CHARLIE_ORCID = Reference.from_curie("orcid:0000-0003-4423-4370") +CHARLIE_NAME = "Charles Tapley Hoyt" BEN_ORCID = Reference.from_curie("orcid:0000-0001-9439-5346") diff --git a/src/semra/sources/chembl.py b/src/semra/sources/chembl.py index 13ebd4e..6ed6f86 100644 --- a/src/semra/sources/chembl.py +++ b/src/semra/sources/chembl.py @@ -54,9 +54,9 @@ def get_chembl_protein_mappings(version: Optional[str] = None) -> list[Mapping]: df = chembl_downloader.get_uniprot_mapping_df(version=version) return [ Mapping( - s=Reference(prefix="uniprot", identifier=uniprot), + s=Reference(prefix="chembl.target", identifier=chembl_id), p=EXACT_MATCH, - o=Reference(prefix="chembl.target", identifier=chembl_id), + o=Reference(prefix="uniprot", identifier=uniprot), evidence=[ SimpleEvidence( justification=UNSPECIFIED_MAPPING, diff --git a/src/semra/sources/famplex.py b/src/semra/sources/famplex.py index c4512c5..f9701f7 100644 --- a/src/semra/sources/famplex.py +++ b/src/semra/sources/famplex.py @@ -42,7 +42,7 @@ def get_fplx_mappings() -> list[Mapping]: and not (target_prefix == "NXP" and target_id.startswith("FA:")) # is this a problem? ) ] - validate_mappings(rv) + validate_mappings(rv, progress=False) return rv diff --git a/src/semra/sources/pubchem.py b/src/semra/sources/pubchem.py index 551644a..e93beec 100644 --- a/src/semra/sources/pubchem.py +++ b/src/semra/sources/pubchem.py @@ -3,11 +3,11 @@ from __future__ import annotations import logging -from typing import Optional +from typing import Optional, Set import bioversions -import pandas as pd import pyobo +import requests from curies import Reference from semra.rules import EXACT_MATCH, UNSPECIFIED_MAPPING @@ -25,29 +25,26 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]: """Get a mapping from PubChem compound identifiers to their equivalent MeSH terms.""" if version is None: version = bioversions.get_version("pubchem") - url = f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH" - df = pd.read_csv( - url, - dtype=str, - header=None, - names=["pubchem", "mesh"], - ) + mesh_name_to_id = pyobo.get_name_id_mapping("mesh") - needs_curation = set() - mesh_ids = [] - for name in df["mesh"]: - mesh_id = mesh_name_to_id.get(name) - if mesh_id is None and name not in needs_curation: - needs_curation.add(name) - logger.debug("[mesh] needs curating: %s", name) - mesh_ids.append(mesh_id) - logger.info("[mesh] %d/%d need updating", len(needs_curation), len(mesh_ids)) - df["mesh"] = mesh_ids - - return [ - Mapping( + needs_curation: Set[str] = set() + + url = f"https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH" + res = requests.get(url, stream=True) + + rv = [] + for line in res.iter_lines(): + # on a small number of entries, there are multiple names. 
their impact is negligible + pubchem, mesh_name, *_ = line.decode("utf8").strip().split("\t") + mesh_id = mesh_name_to_id.get(mesh_name) + if mesh_id is None: + if mesh_name not in needs_curation: + needs_curation.add(mesh_name) + logger.debug("[mesh] needs curating: %s", mesh_name) + continue + mapping = Mapping( s=Reference(prefix="pubchem.compound", identifier=pubchem), - o=Reference(prefix="mesh", identifier=mesh), + o=Reference(prefix="mesh", identifier=mesh_id), p=EXACT_MATCH, evidence=[ SimpleEvidence( @@ -57,6 +54,7 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]: ) ], ) - for pubchem, mesh in df.values - if mesh is not None - ] + rv.append(mapping) + + logger.warning("[pubchem-mesh] %d MeSH names need manual curation", len(needs_curation)) + return rv diff --git a/src/semra/sources/wikidata.py b/src/semra/sources/wikidata.py index 9f18994..b71cd62 100644 --- a/src/semra/sources/wikidata.py +++ b/src/semra/sources/wikidata.py @@ -42,13 +42,19 @@ def _help( if predicate is None: predicate = EXACT_MATCH - mapping_set = MappingSet(name="Wikidata", license="CC0", confidence=0.99) + mapping_set = MappingSet(name="wikidata", license="CC0", confidence=0.99) return [ Mapping( s=Reference(prefix="wikidata", identifier=wikidata_id), p=predicate, - o=Reference(prefix=target_prefix, identifier=xref_id), + o=Reference(prefix=target_prefix, identifier=_clean_xref_id(target_prefix, xref_id)), evidence=[SimpleEvidence(justification=UNSPECIFIED_MAPPING, mapping_set=mapping_set)], ) for wikidata_id, xref_id in iter_wikidata_mappings(prop, cache=cache) ] + + +def _clean_xref_id(prefix: str, identifier: str) -> str: + if identifier.lower().startswith(f"{prefix}_"): + identifier = identifier[len(prefix) + 1 :] + return identifier
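For clarity, the new `_clean_xref_id` helper strips a redundant `PREFIX_` banana that some Wikidata cross-reference values carry before the identifier proper. A minimal usage sketch under that assumption; the example values are illustrative, not taken from a real Wikidata dump:

```python
# _clean_xref_id removes a leading "{prefix}_" (matched case-insensitively), if present
assert _clean_xref_id("fplx", "FPLX_RAS") == "RAS"
assert _clean_xref_id("mesh", "D001829") == "D001829"  # already clean: returned unchanged
```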
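A few editorial notes with illustrative sketches on other parts of this patch follow. First, on the landscape summary table in `notebooks/landscape/README.md`: the `reduction` column is the fraction of raw terms collapsed by mapping-based merging, i.e. 1 − unique_term_count / raw_term_count. A minimal check in Python, using the values from the disease row:

```python
def reduction(raw_term_count: int, unique_term_count: int) -> float:
    """Fraction of raw terms collapsed by merging equivalent terms."""
    return 1 - unique_term_count / raw_term_count

# disease row: 410,173 raw terms reduce to 243,730 unique terms
assert round(reduction(410_173, 243_730), 6) == 0.405787

# the gene row's unique count must therefore be about 48.8 million
# (consistent with the 4.87886e+07 in the table this patch replaces),
# not 207,019, which is the cell row's raw count
```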
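The per-landscape `configuration.json` files added above are serialized `semra.Configuration` models, so they can be loaded back with the same pydantic v1-style `parse_file` call that `landscape.py` uses in this patch. A minimal sketch, assuming a local checkout with the repository-relative path shown in the diff:

```python
from semra import Configuration

config = Configuration.parse_file("notebooks/landscape/anatomy/configuration.json")
print(config.name)          # SeMRA Anatomy Mappings Database
print(config.priority)      # ['uberon', 'mesh', 'bto', 'caro', 'ncit', 'umls']
print(config.zenodo_url())  # https://bioregistry.io/zenodo.record:11091803
```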
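The new Wikidata pass in `database.py` iterates over every Bioregistry prefix that has a mapped Wikidata property. A sketch of what that lookup yields, assuming `bioregistry.get_registry_map` behaves as it is used in the diff:

```python
import bioregistry

# maps Bioregistry prefixes to Wikidata property identifiers,
# e.g., "mesh" -> "P486" (the MeSH descriptor ID property)
wikidata_properties = bioregistry.get_registry_map("wikidata")
print(wikidata_properties.get("mesh"))
```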
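The Zenodo integration in `database.py` and `pipeline.py` reduces to two `zenodo_client` calls: `ensure_zenodo` for a first deposition (keyed locally so that reruns reuse the same record) and `update_zenodo` for pushing new versions of an existing record. A minimal standalone sketch; the key and file name here are hypothetical:

```python
from zenodo_client import Creator, Metadata, ensure_zenodo

metadata = Metadata(
    title="SeMRA Mapping Database",
    upload_type="dataset",
    description="A compendium of semantic mappings.",
    creators=[Creator(name="Charles Tapley Hoyt", orcid="0000-0003-4423-4370")],
)
# first deposition; sandbox=True targets sandbox.zenodo.org instead of production
res = ensure_zenodo(
    key="semra-demo",
    data=metadata,
    paths=["mappings.sssom.tsv.gz"],
    sandbox=True,
)
print(res.json()["links"]["html"])

# later, to attach updated files as a new version of an existing record:
# from zenodo_client import update_zenodo
# update_zenodo("<record-id>", paths=["mappings.sssom.tsv.gz"])
```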
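Finally, the `write_neo4j` changes switch to gzipped, tab-separated bulk-import files and move node labels from a `:LABEL` column into the `--nodes=<label>=<file>` import syntax. A minimal sketch of the resulting file shape, mirroring the `_write_tsv_gz` helper; the example row is illustrative:

```python
import csv
import gzip

# gzipped, tab-delimited, header row first -- the same pattern as _write_tsv_gz
with gzip.open("concept_nodes.tsv.gz", "wt") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerow(["curie:ID", "prefix", "name", "priority:boolean"])
    writer.writerow(["mesh:D001829", "mesh", "Body Regions", "false"])

# neo4j-admin import reads the gzipped file directly, e.g.:
#   neo4j-admin import --delimiter='TAB' --nodes=concept=concept_nodes.tsv.gz ...
```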