Skip to content

Commit

Permalink
Update web app
Browse files Browse the repository at this point in the history
- Add several additional summaries
- Improve label adding
- Improve equivalency class checking
  • Loading branch information
cthoyt committed Oct 16, 2023
1 parent a76da7d commit 3b9bb6c
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 78 deletions.
2 changes: 1 addition & 1 deletion scripts/cancer_cell_reproduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
),
Input(prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"}),
],
# add_labels=True,
add_labels=True,
priority=PRIORITY,
keep_prefixes=PREFIXES,
remove_imprecise=False,
Expand Down
4 changes: 3 additions & 1 deletion src/semra/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from semra.rules import EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING
from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING, REPLACED_BY
from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence

__all__ = [
Expand All @@ -10,6 +10,8 @@
"MappingSet",
# Mapping predicates
"EXACT_MATCH",
"DB_XREF",
"REPLACED_BY",
# Mapping justifications
"LEXICAL_MAPPING",
"MANUAL_MAPPING",
Expand Down
21 changes: 20 additions & 1 deletion src/semra/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import semra
from semra import Evidence, MappingSet, Reference
from semra.io import _get_name_by_curie

__all__ = [
"Node",
Expand Down Expand Up @@ -198,7 +199,6 @@ def summarize_mapping_sets(self) -> Counter:
return Counter(dict(self.read_query(query)))

def summarize_nodes(self) -> Counter:
# TODO count number of "equivalence classes"
query = """\
MATCH (n:evidence) WITH count(n) as count RETURN 'Evidences' as label, count UNION ALL
MATCH (n:concept) WITH count(n) as count RETURN 'References' as label, count UNION ALL
Expand All @@ -208,6 +208,25 @@ def summarize_nodes(self) -> Counter:
"""
return Counter(dict(self.read_query(query)))

def summarize_concepts(self) -> Counter:
    """Count the concept nodes in the graph, grouped by prefix.

    Concept nodes whose prefix is ``orcid`` are excluded by the query.

    :returns: A counter built from the ``(prefix, count)`` rows of the query.
    """
    cypher = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)"
    # assumes read_query yields two-element rows that dict() can consume -- TODO confirm
    prefix_to_count = dict(self.read_query(cypher))
    return Counter(prefix_to_count)

def summarize_authors(self) -> Counter:
    """Count evidence nodes per author.

    The query follows ``hasAuthor`` edges from evidence nodes to concept nodes
    and returns each author's CURIE together with the number of evidences.

    :returns: A counter built from the ``(author CURIE, count)`` rows of the query.
    """
    cypher = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, count(e)"
    # assumes read_query yields two-element rows that dict() can consume -- TODO confirm
    author_to_count = dict(self.read_query(cypher))
    return Counter(author_to_count)

def get_highest_exact_matches(self, limit: int = 10) -> Counter:
    """Get the priority nodes that have the most distinct exact matches.

    :param limit: Maximum number of rows requested from the query.
    :returns: A counter built from the ``(CURIE, distinct neighbor count)`` rows,
        which the query orders from highest to lowest count.
    """
    # Implicit concatenation keeps the query text identical while staying readable.
    cypher = (
        "MATCH (a)-[:`skos:exactMatch`]-(b) "
        "WHERE a.priority "
        "RETURN a.curie, count(distinct b) as c "
        "ORDER BY c DESCENDING "
        "LIMIT $limit"
    )
    rows = self.read_query(cypher, limit=limit)
    curie_to_count = dict(rows)
    return Counter(curie_to_count)

def get_exact_matches(self, curie: str) -> set[Reference]:
    """Get the references connected to a CURIE by an exact match edge.

    The edge is matched in either direction.

    :param curie: The CURIE of the node whose neighbors are looked up.
    :returns: The set of references parsed from each neighbor's ``curie`` property.
    """
    cypher = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b"
    matches = set()
    # Each row is expected to hold exactly one value: the neighbor node.
    for (neighbor,) in self.read_query(cypher, curie=curie):
        matches.add(Reference.from_curie(neighbor["curie"]))
    return matches

def get_concept_name(self, curie: str) -> str | None:
    """Look up the name for a CURIE.

    This delegates to ``_get_name_by_curie`` imported from :mod:`semra.io`,
    which is declared to return ``str | None``; the return annotation here
    mirrors that so callers know they must handle a missing name.

    :param curie: The CURIE to look up.
    :returns: Whatever the underlying lookup returns: a name, or ``None``.
    """
    return _get_name_by_curie(curie)


# Follows example here:
# https://neo4j.com/docs/python-manual/current/session-api/#python-driver-simple-transaction-fn
Expand Down
28 changes: 18 additions & 10 deletions src/semra/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pickle
from pathlib import Path
from textwrap import dedent
from typing import TextIO
from typing import TextIO, cast

import bioontologies
import bioregistry
Expand All @@ -16,6 +16,7 @@
import pyobo.utils
from tqdm.auto import tqdm

from bioregistry import Collection
from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING
from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence

Expand Down Expand Up @@ -341,13 +342,12 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat

SKIP_PREFIXES = {
"pubchem",
"pubchem.compound",
"pubchem.substance",
"kegg",
"snomedct",
"icd9",
"icd10",
"icd11",
"icd",
}
SKIP_PREFIXES.update(cast(Collection, bioregistry.get_collection("0000004")).resources)


def _get_name_by_curie(curie: str) -> str | None:
Expand Down Expand Up @@ -418,8 +418,10 @@ def _edge_key(t):
def write_neo4j(
mappings: list[Mapping],
directory: str | Path,
*,
docker_name: str | None = None,
priority_references: set[Reference] | None = None,
equivalence_classes: dict[Reference, bool] | None = None,
add_labels: bool = False,
) -> None:
directory = Path(directory).resolve()
if not directory.is_dir():
Expand All @@ -431,9 +433,9 @@ def write_neo4j(

concept_nodes_path = directory.joinpath("concept_nodes.tsv")
concepts: set[Reference] = set()
if priority_references is None:
priority_references = set()
concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "priority:boolean"]
concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "priority:boolean"]
if equivalence_classes is None:
equivalence_classes = {}

mapping_nodes_path = directory.joinpath("mapping_nodes.tsv")
mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence"]
Expand Down Expand Up @@ -493,7 +495,13 @@ def write_neo4j(
concept_nodes_path,
concept_nodes_header,
(
(concept.curie, "concept", concept.prefix, "true" if concept in priority_references else "false")
(
concept.curie,
"concept",
concept.prefix,
pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "",
"true" if equivalence_classes.get(concept, False) else "false",
)
for concept in sorted(concepts, key=lambda n: n.curie)
),
)
Expand Down
44 changes: 29 additions & 15 deletions src/semra/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,7 @@ class Configuration(BaseModel):
priority_sssom_path: Path | None = None
# note that making a priority neo4j doesn't make sense

sssom_add_labels: bool = Field(
default=False, description="Should PyOBO be used to look up labels for SSSOM output?"
)
add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?")

@root_validator(skip_on_failure=True)
def infer_priority(cls, values): # noqa:N805
Expand Down Expand Up @@ -153,9 +151,14 @@ def get_mappings_from_config(
if configuration.raw_pickle_path:
write_pickle(raw_mappings, configuration.raw_pickle_path)
if configuration.raw_sssom_path:
write_sssom(raw_mappings, configuration.raw_sssom_path, add_labels=configuration.sssom_add_labels)
write_sssom(raw_mappings, configuration.raw_sssom_path, add_labels=configuration.add_labels)
if configuration.raw_neo4j_path:
write_neo4j(raw_mappings, configuration.raw_neo4j_path, configuration.raw_neo4j_name)
write_neo4j(
raw_mappings,
configuration.raw_neo4j_path,
docker_name=configuration.raw_neo4j_name,
add_labels=configuration.add_labels,
)

# click.echo(semra.api.str_source_target_counts(mappings, minimum=20))
processed_mappings = process(
Expand All @@ -172,24 +175,34 @@ def get_mappings_from_config(
if configuration.processed_pickle_path:
write_pickle(processed_mappings, configuration.processed_pickle_path)
if configuration.processed_sssom_path:
write_sssom(processed_mappings, configuration.processed_sssom_path, add_labels=configuration.sssom_add_labels)
write_sssom(processed_mappings, configuration.processed_sssom_path, add_labels=configuration.add_labels)
if configuration.processed_neo4j_path:
priority_references = {mapping.o for mapping in prioritized_mappings}
equivalence_classes = _get_equivalence_classes(processed_mappings, prioritized_mappings)
write_neo4j(
processed_mappings,
configuration.processed_neo4j_path,
configuration.processed_neo4j_name,
priority_references=priority_references,
docker_name=configuration.processed_neo4j_name,
equivalence_classes=equivalence_classes,
add_labels=configuration.add_labels,
)

if configuration.priority_pickle_path:
write_pickle(prioritized_mappings, configuration.priority_pickle_path)
if configuration.priority_sssom_path:
write_sssom(prioritized_mappings, configuration.priority_sssom_path, add_labels=configuration.sssom_add_labels)
write_sssom(prioritized_mappings, configuration.priority_sssom_path, add_labels=configuration.add_labels)

return prioritized_mappings


def _get_equivalence_classes(mappings, prioritized_mappings) -> dict[Reference, bool]:
    """Flag each reference used in the mappings by whether it is a priority reference.

    A reference is treated as priority when it appears as the object (``.o``)
    of at least one prioritized mapping.

    :param mappings: Mappings whose subjects (``.s``) and objects (``.o``) become the keys.
    :param prioritized_mappings: Mappings whose objects define the priority references.
    :returns: A dictionary from each subject/object reference to its priority flag.
    """
    priority_objects = {prioritized.o for prioritized in prioritized_mappings}
    # Subject first, then object, for each mapping -- same insertion order as a plain loop.
    return {
        reference: reference in priority_objects
        for mapping in mappings
        for reference in (mapping.s, mapping.o)
    }


def get_raw_mappings(configuration: Configuration) -> list[Mapping]:
"""Get raw mappings based on the inputs in a configuration."""
mappings = []
Expand Down Expand Up @@ -265,11 +278,12 @@ def process(
# logger.debug(f"Filtered to {len(mappings):,} mappings")

# remove mapping between self, such as EFO-EFO
logger.info("Removing self mappings (i.e., within a given semantic space)")
before = len(mappings)
start = time.time()
mappings = filter_self_matches(mappings)
_log_diff(before, mappings, verb="Filtered source internal", elapsed=time.time() - start)
# TODO handle self-mappings better using "replaced by" relations
# logger.info("Removing self mappings (i.e., within a given semantic space)")
# before = len(mappings)
# start = time.time()
# mappings = filter_self_matches(mappings)
# _log_diff(before, mappings, verb="Filtered source internal", elapsed=time.time() - start)

if upgrade_prefixes:
logger.info("Inferring mapping upgrades")
Expand Down
1 change: 1 addition & 0 deletions src/semra/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
CLOSE_MATCH = Reference(prefix="skos", identifier="closeMatch")
DB_XREF = Reference(prefix="oboinowl", identifier="hasDbXref")
EQUIVALENT_TO = Reference(prefix="owl", identifier="equivalentTo")
REPLACED_BY = Reference(prefix="iao", identifier="0100001")

IMPRECISE = {DB_XREF, CLOSE_MATCH}
FLIP = {
Expand Down
52 changes: 52 additions & 0 deletions src/semra/templates/concept.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{% extends "base.html" %}
{# Concept page: shows the `curie` passed by the view, linked to the Bioregistry,
   followed by a table of its `exact_matches` sorted by their `curie` attribute. #}

{% import "bootstrap5/utils.html" as util %}

{% block title %}SeMRA{% endblock %}


{% block content %}
<div class="container" style="margin-top: 50px; margin-bottom: 50px">
{{ util.render_messages(dismissible=True, container=False) }}
<div class="row">
<div class="card">
<div class="card-body">
<h5 class="card-title">
<a href="https://bioregistry.io/{{ curie }}">{{ curie }}</a>
</h5>
<h6>Exact Matches</h6>
</div>
<table class="table table-striped table-borderless">
<tbody>
{# One row per exact match: its CURIE, a link to its page via the `view_concept`
   endpoint, and a link to the Bioregistry. #}
{% for exact_match in exact_matches | sort(attribute='curie') %}
<tr>
<td>
<code>{{ exact_match.curie }}</code>
</td>
<td>
<a href="{{ url_for('view_concept', curie=exact_match.curie) }}">SeMRA</a>
</td>
<td>
<a href="https://bioregistry.io/{{ exact_match.curie }}">Bioregistry</a>
</td>
{# The extra column is only rendered when `has_biomappings` is truthy. #}
{% if has_biomappings %}
<td>
{# NOTE(review): `reference` is read here in addition to `curie`; confirm the view passes both. #}
{% if reference.prefix == exact_match.prefix %}

Handle in-prefix mapping

{% else %}
<a href="{{ url_for('mark_exact_incorrect', source=curie, target=exact_match.curie) }}">
Mark as Incorrect
</a>
{% endif %}
</td>
{% endif %}
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
{% endblock %}
Loading

0 comments on commit 3b9bb6c

Please sign in to comment.