Skip to content

Commit

Permalink
Add flag for primary evidence to neo4j
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Oct 16, 2023
1 parent 3b9bb6c commit 3344440
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 18 deletions.
3 changes: 3 additions & 0 deletions scripts/cancer_cell_reproduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@
remove_imprecise=False,
mutations=[
Mutation(source="efo", confidence=0.7),
Mutation(source="bto", confidence=0.7),
Mutation(source="cl", confidence=0.7),
Mutation(source="clo", confidence=0.7),
Mutation(source="depmap", confidence=0.7),
Mutation(source="ccle", confidence=0.7),
Mutation(source="cellosaurus", confidence=0.7),
Expand Down
2 changes: 1 addition & 1 deletion src/semra/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING, REPLACED_BY
from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, REPLACED_BY, UNSPECIFIED_MAPPING
from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence

__all__ = [
Expand Down
4 changes: 2 additions & 2 deletions src/semra/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,9 @@ def get_highest_exact_matches(self, limit: int = 10) -> Counter:
query = "MATCH (a)-[:`skos:exactMatch`]-(b) WHERE a.priority RETURN a.curie, count(distinct b) as c ORDER BY c DESCENDING LIMIT $limit"
return Counter(dict(self.read_query(query, limit=limit)))

def get_exact_matches(self, curie: str) -> dict[Reference, str]:
    """Get the concepts connected to the given one by ``skos:exactMatch`` relationships.

    :param curie: The CURIE of the concept whose exact matches are looked up.
    :returns: A dictionary from each matched concept's reference to the
        ``name`` value stored on its node.
    """
    query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b"
    # Each result row carries exactly one node, hence the one-element unpacking.
    return {Reference.from_curie(node["curie"]): node["name"] for node, in self.read_query(query, curie=curie)}

def get_concept_name(self, curie: str) -> str:
    """Return the name for a CURIE as resolved by ``_get_name_by_curie``."""
    name = _get_name_by_curie(curie)
    return name
Expand Down
44 changes: 35 additions & 9 deletions src/semra/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pickle
from pathlib import Path
from textwrap import dedent
from typing import TextIO, cast
from typing import Literal, TextIO, cast

import bioontologies
import bioregistry
Expand All @@ -14,9 +14,9 @@
import pandas as pd
import pyobo
import pyobo.utils
from bioregistry import Collection
from tqdm.auto import tqdm

from bioregistry import Collection
from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING
from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence

Expand Down Expand Up @@ -411,10 +411,14 @@ def from_pickle(path: str | Path) -> list[Mapping]:


def _edge_key(t):
    """Build the sort key for one row of the Neo4j edges table.

    :param t: An edge row whose first four fields are the start ID, the edge
        type, the end ID, and the confidence. Any further trailing fields
        (such as the primary-evidence flag) are accepted and ignored here.
    :returns: A tuple of the start ID, edge type, end ID, a flag that is ``1``
        when the confidence is a float and ``0`` otherwise, and finally the
        whole row as a tie-breaker.
    """
    # Starred unpacking accepts both the four-field annotation rows and the
    # five-field mapping rows; a strict four-name unpack would raise ValueError
    # on the latter.
    s, p, o, c, *_ = t
    return s, p, o, 1 if isinstance(c, float) else 0, t


def _neo4j_bool(b: bool, /) -> Literal["true", "false"]:  # noqa:FBT001
    """Render a flag as the lowercase text used in the Neo4j TSV boolean columns.

    :param b: The flag to serialize; it is evaluated for truthiness.
    :returns: ``"true"`` if ``b`` is truthy, otherwise ``"false"``.
    """
    if b:
        return "true"
    return "false"


def write_neo4j(
mappings: list[Mapping],
directory: str | Path,
Expand All @@ -438,7 +442,7 @@ def write_neo4j(
equivalence_classes = {}

mapping_nodes_path = directory.joinpath("mapping_nodes.tsv")
mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence"]
mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence", "hasPrimary:boolean"]

evidence_nodes_path = directory.joinpath("evidence_nodes.tsv")
evidences = {}
Expand All @@ -453,11 +457,25 @@ def write_neo4j(

mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv")
mapping_sets = {}
mapping_set_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "license", "version", "confidence:float"]
mapping_set_nodes_header = [
"curie:ID",
":LABEL",
"prefix",
"name",
"license",
"version",
"confidence:float",
]

edges_path = directory.joinpath("edges.tsv")
edges: list[tuple[str, str, str, str | float]] = []
edges_header = [":START_ID", ":TYPE", ":END_ID", "confidence:float"]
edges_header = [
":START_ID",
":TYPE",
":END_ID",
"confidence:float",
"hasPrimary:boolean",
]

for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"):
concepts.add(mapping.s)
Expand All @@ -468,6 +486,7 @@ def write_neo4j(
mapping.p.curie,
mapping.o.curie,
round(c, 4) if (c := mapping.confidence) is not None else "",
_neo4j_bool(mapping.has_primary_evidence),
)
)
edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, ""))
Expand Down Expand Up @@ -500,7 +519,7 @@ def write_neo4j(
"concept",
concept.prefix,
pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "",
"true" if equivalence_classes.get(concept, False) else "false",
_neo4j_bool(equivalence_classes.get(concept, False)),
)
for concept in sorted(concepts, key=lambda n: n.curie)
),
Expand All @@ -509,7 +528,14 @@ def write_neo4j(
mapping_nodes_path,
mapping_nodes_header,
(
(mapping.curie, "mapping", "semra.mapping", mapping.p.curie, mapping.confidence)
(
mapping.curie,
"mapping",
"semra.mapping",
mapping.p.curie,
mapping.confidence and round(mapping.confidence, 4),
_neo4j_bool(mapping.has_primary_evidence),
)
for mapping in sorted(mappings, key=lambda n: n.curie)
),
)
Expand Down Expand Up @@ -577,7 +603,7 @@ def write_neo4j(
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
ARG twiddle1=dee
RUN python3.11 -m pip install git+https://github.com/biopragmatics/semra.git#egg=semra[web]
RUN python3.11 -m pip install "semra[web] @ git+https://github.com/biopragmatics/semra.git"
# Add graph content
ARG twiddle2=dee
Expand Down
5 changes: 5 additions & 0 deletions src/semra/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,11 @@ def get_reference(self) -> Reference:
def curie(self) -> str:
return self.get_reference().curie

@property
def has_primary_evidence(self) -> bool:
    """Get if there is a primary evidence associated with this mapping.

    An evidence counts as primary when it is a ``SimpleEvidence`` instance.
    """
    for evidence in self.evidence:
        if isinstance(evidence, SimpleEvidence):
            return True
    return False


def line(*references: Reference) -> list[Mapping]:
"""Create a list of mappings from a simple mappings path."""
Expand Down
7 changes: 5 additions & 2 deletions src/semra/templates/concept.html
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,20 @@
<div class="card">
<div class="card-body">
<h5 class="card-title">
<a href="https://bioregistry.io/{{ curie }}">{{ curie }}</a>
{{ name }} <a class="badge badge-info" href="https://bioregistry.io/{{ curie }}">{{ curie }}</a>
</h5>
<h6>Exact Matches</h6>
</div>
<table class="table table-striped table-borderless">
<tbody>
{% for exact_match in exact_matches | sort(attribute='curie') %}
{% for exact_match, name in exact_matches.items() %}
<tr>
<td>
<code>{{ exact_match.curie }}</code>
</td>
<td>
{{ name }}
</td>
<td>
<a href="{{ url_for('view_concept', curie=exact_match.curie) }}">SeMRA</a>
</td>
Expand Down
19 changes: 15 additions & 4 deletions src/semra/wsgi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import os

import biomappings.resources
import biomappings.utils
import fastapi
import flask
from curies import Reference
Expand All @@ -15,6 +13,11 @@
from semra import Evidence, Mapping, MappingSet
from semra.client import Neo4jClient

try:
import biomappings.utils as biomappings_utils
except ImportError:
biomappings_utils = None

client = Neo4jClient()

api_router = fastapi.APIRouter()
Expand All @@ -29,7 +32,7 @@
api_router.mount("/", WSGIMiddleware(flask_app))

EXAMPLE_MAPPINGS = ["25b67912bc720127a43a06ce4688b672", "5a56bf7ac409d8de84c3382a99e17715"]
# Resolve to ``None`` (never ``False``) when the optional biomappings package
# is missing, so that ``BIOMAPPINGS_GIT_HASH is not None`` checks stay correct.
BIOMAPPINGS_GIT_HASH = biomappings_utils.get_git_hash() if biomappings_utils is not None else None

PREDICATE_COUNTER = client.summarize_predicates()
MAPPING_SET_COUNTER = client.summarize_mapping_sets()
Expand Down Expand Up @@ -93,11 +96,17 @@ def view_mapping(curie: str):
def view_concept(curie: str):
    """View a concept.

    :param curie: The CURIE of the concept to display.
    :returns: The rendered ``concept.html`` page showing the concept's name
        and its exact matches.
    """
    reference = Reference.from_curie(curie)
    name = client.get_concept_name(curie)
    exact_matches = client.get_exact_matches(curie)
    # TODO when showing equivalence between two entities from same namespace, suggest curating a replaced by relation

    return render_template(
        "concept.html",
        reference=reference,
        curie=curie,
        name=name,
        exact_matches=exact_matches,
        # Use truthiness instead of ``is not None``: the module-level hash is
        # falsy (``False`` or ``None``) when biomappings is unavailable.
        has_biomappings=bool(BIOMAPPINGS_GIT_HASH),
    )


Expand All @@ -110,6 +119,8 @@ def mark_exact_incorrect(source: str, target: str):
flask.flash("Can't interact with biomappings", category="error")
return flask.redirect(flask.url_for(view_concept.__name__, curie=source))

import biomappings.resources

source_reference = Reference.from_curie(source)
target_reference = Reference.from_curie(target)

Expand Down

0 comments on commit 3344440

Please sign in to comment.