From 334444074c061b54c3630e97bd45466f89706f55 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 17 Oct 2023 00:09:59 +0200 Subject: [PATCH] Add flag for primary evidence to neo4j --- scripts/cancer_cell_reproduction.py | 3 ++ src/semra/__init__.py | 2 +- src/semra/client.py | 4 +-- src/semra/io.py | 44 +++++++++++++++++++++++------ src/semra/struct.py | 5 ++++ src/semra/templates/concept.html | 7 +++-- src/semra/wsgi.py | 19 ++++++++++--- 7 files changed, 66 insertions(+), 18 deletions(-) diff --git a/scripts/cancer_cell_reproduction.py b/scripts/cancer_cell_reproduction.py index c3b4553..7408692 100644 --- a/scripts/cancer_cell_reproduction.py +++ b/scripts/cancer_cell_reproduction.py @@ -75,6 +75,9 @@ remove_imprecise=False, mutations=[ Mutation(source="efo", confidence=0.7), + Mutation(source="bto", confidence=0.7), + Mutation(source="cl", confidence=0.7), + Mutation(source="clo", confidence=0.7), Mutation(source="depmap", confidence=0.7), Mutation(source="ccle", confidence=0.7), Mutation(source="cellosaurus", confidence=0.7), diff --git a/src/semra/__init__.py b/src/semra/__init__.py index 8a6be7e..6ed0832 100644 --- a/src/semra/__init__.py +++ b/src/semra/__init__.py @@ -1,4 +1,4 @@ -from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING, REPLACED_BY +from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, REPLACED_BY, UNSPECIFIED_MAPPING from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence __all__ = [ diff --git a/src/semra/client.py b/src/semra/client.py index fce5515..d0b76bc 100644 --- a/src/semra/client.py +++ b/src/semra/client.py @@ -220,9 +220,9 @@ def get_highest_exact_matches(self, limit: int = 10) -> Counter: query = "MATCH (a)-[:`skos:exactMatch`]-(b) WHERE a.priority RETURN a.curie, count(distinct b) as c ORDER BY c DESCENDING LIMIT $limit" return Counter(dict(self.read_query(query, limit=limit))) - def get_exact_matches(self, curie: str) -> set[Reference]: + def get_exact_matches(self, curie: str) -> dict[Reference, str]: query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b" - return {Reference.from_curie(node["curie"]) for node, in self.read_query(query, curie=curie)} + return {Reference.from_curie(node["curie"]): node["name"] for node, in self.read_query(query, curie=curie)} def get_concept_name(self, curie: str) -> str: return _get_name_by_curie(curie) diff --git a/src/semra/io.py b/src/semra/io.py index 3a9e665..371ccf1 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -5,7 +5,7 @@ import pickle from pathlib import Path from textwrap import dedent -from typing import TextIO, cast +from typing import Literal, TextIO, cast import bioontologies import bioregistry @@ -14,9 +14,9 @@ import pandas as pd import pyobo import pyobo.utils +from bioregistry import Collection from tqdm.auto import tqdm -from bioregistry import Collection from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence @@ -411,10 +411,14 @@ def from_pickle(path: str | Path) -> list[Mapping]: def _edge_key(t): - s, p, o, c = t + s, p, o, c, *_ = t return s, p, o, 1 if isinstance(c, float) else 0, t +def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001 + return "true" if b else "false" + + def write_neo4j( mappings: list[Mapping], directory: str | Path, @@ -438,7 +442,7 @@ def write_neo4j( equivalence_classes = {} mapping_nodes_path = directory.joinpath("mapping_nodes.tsv") - mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence"] + mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence", "hasPrimary:boolean"] evidence_nodes_path = directory.joinpath("evidence_nodes.tsv") evidences = {} @@ -453,11 +457,25 @@ def write_neo4j( mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv") mapping_sets = {} - mapping_set_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "license", "version", "confidence:float"] + mapping_set_nodes_header = [ + "curie:ID", + ":LABEL", + "prefix", + "name", + "license", + "version", + "confidence:float", + ] edges_path = directory.joinpath("edges.tsv") edges: list[tuple[str, str, str, str | float]] = [] - edges_header = [":START_ID", ":TYPE", ":END_ID", "confidence:float"] + edges_header = [ + ":START_ID", + ":TYPE", + ":END_ID", + "confidence:float", + "hasPrimary:boolean", + ] for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): concepts.add(mapping.s) @@ -468,6 +486,7 @@ def write_neo4j( mapping.p.curie, mapping.o.curie, round(c, 4) if (c := mapping.confidence) is not None else "", + _neo4j_bool(mapping.has_primary_evidence), ) ) edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "")) @@ -500,7 +519,7 @@ def write_neo4j( "concept", concept.prefix, pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "", - "true" if equivalence_classes.get(concept, False) else "false", + _neo4j_bool(equivalence_classes.get(concept, False)), ) for concept in sorted(concepts, key=lambda n: n.curie) ), @@ -509,7 +528,14 @@ def write_neo4j( mapping_nodes_path, mapping_nodes_header, ( - (mapping.curie, "mapping", "semra.mapping", mapping.p.curie, mapping.confidence) + ( + mapping.curie, + "mapping", + "semra.mapping", + mapping.p.curie, + mapping.confidence and round(mapping.confidence, 4), + _neo4j_bool(mapping.has_primary_evidence), + ) for mapping in sorted(mappings, key=lambda n: n.curie) ), ) @@ -577,7 +603,7 @@ def write_neo4j( curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 ARG twiddle1=dee - RUN python3.11 -m pip install git+https://github.com/biopragmatics/semra.git#egg=semra[web] + RUN python3.11 -m pip install "semra[web] @ git+https://github.com/biopragmatics/semra.git" # Add graph content ARG twiddle2=dee diff --git a/src/semra/struct.py b/src/semra/struct.py index 46f660c..8231492 100644 --- a/src/semra/struct.py +++ b/src/semra/struct.py @@ -225,6 +225,11 @@ def get_reference(self) -> Reference: def curie(self) -> str: return self.get_reference().curie + @property + def has_primary_evidence(self) -> bool: + """Get if there is a primary evidence associated with this mapping.""" + return any(isinstance(evidence, SimpleEvidence) for evidence in self.evidence) + def line(*references: Reference) -> list[Mapping]: """Create a list of mappings from a simple mappings path.""" diff --git a/src/semra/templates/concept.html b/src/semra/templates/concept.html index e071473..274f55f 100644 --- a/src/semra/templates/concept.html +++ b/src/semra/templates/concept.html @@ -12,17 +12,20 @@
- {{ curie }} + {{ name }} {{ curie }}
Exact Matches
- {% for exact_match in exact_matches | sort(attribute='curie') %} + {% for exact_match, name in exact_matches.items() %} + diff --git a/src/semra/wsgi.py b/src/semra/wsgi.py index feb8a26..7d61c03 100644 --- a/src/semra/wsgi.py +++ b/src/semra/wsgi.py @@ -2,8 +2,6 @@ import os -import biomappings.resources -import biomappings.utils import fastapi import flask from curies import Reference @@ -15,6 +13,11 @@ from semra import Evidence, Mapping, MappingSet from semra.client import Neo4jClient +try: + import biomappings.utils as biomappings_utils +except ImportError: + biomappings_utils = None + client = Neo4jClient() api_router = fastapi.APIRouter() @@ -29,7 +32,7 @@ api_router.mount("/", WSGIMiddleware(flask_app)) EXAMPLE_MAPPINGS = ["25b67912bc720127a43a06ce4688b672", "5a56bf7ac409d8de84c3382a99e17715"] -BIOMAPPINGS_GIT_HASH = biomappings.utils.get_git_hash() +BIOMAPPINGS_GIT_HASH = biomappings_utils is not None and biomappings_utils.get_git_hash() PREDICATE_COUNTER = client.summarize_predicates() MAPPING_SET_COUNTER = client.summarize_mapping_sets() @@ -93,11 +96,17 @@ def view_mapping(curie: str): def view_concept(curie: str): """View a concept.""" reference = Reference.from_curie(curie) + name = client.get_concept_name(curie) exact_matches = client.get_exact_matches(curie) # TODO when showing equivalence between two entities from same namespace, suggest curating a replaced by relation return render_template( - "concept.html", reference=reference, curie=curie, exact_matches=exact_matches, has_biomappings=BIOMAPPINGS_GIT_HASH is not None + "concept.html", + reference=reference, + curie=curie, + name=name, + exact_matches=exact_matches, + has_biomappings=BIOMAPPINGS_GIT_HASH is not None, ) @@ -110,6 +119,8 @@ def mark_exact_incorrect(source: str, target: str): flask.flash("Can't interact with biomappings", category="error") return flask.redirect(flask.url_for(view_concept.__name__, curie=source)) + import biomappings.resources + source_reference = Reference.from_curie(source) target_reference = Reference.from_curie(target)
{{ exact_match.curie }} + {{ name }} + SeMRA