Update web app

- Add several additional summaries
- Improve label adding
- Improve equivalence class checking
biopragmatics · Oct 16, 2023 · 3b9bb6c · 3b9bb6c
1 parent a76da7d
commit 3b9bb6c
Show file tree

Hide file tree

Showing 9 changed files with 244 additions and 78 deletions.
diff --git a/scripts/cancer_cell_reproduction.py b/scripts/cancer_cell_reproduction.py
@@ -69,7 +69,7 @@
         ),
         Input(prefix="ccle", source="pyobo", confidence=0.99, extras={"version": "2019"}),
     ],
-    # add_labels=True,
+    add_labels=True,
     priority=PRIORITY,
     keep_prefixes=PREFIXES,
     remove_imprecise=False,

diff --git a/src/semra/__init__.py b/src/semra/__init__.py
@@ -1,4 +1,4 @@
-from semra.rules import EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING
+from semra.rules import DB_XREF, EXACT_MATCH, LEXICAL_MAPPING, MANUAL_MAPPING, UNSPECIFIED_MAPPING, REPLACED_BY
 from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
 
 __all__ = [
@@ -10,6 +10,8 @@
     "MappingSet",
     # Mapping predicates
     "EXACT_MATCH",
+    "DB_XREF",
+    "REPLACED_BY",
     # Mapping justifications
     "LEXICAL_MAPPING",
     "MANUAL_MAPPING",

diff --git a/src/semra/client.py b/src/semra/client.py
@@ -11,6 +11,7 @@
 
 import semra
 from semra import Evidence, MappingSet, Reference
+from semra.io import _get_name_by_curie
 
 __all__ = [
     "Node",
@@ -198,7 +199,6 @@ def summarize_mapping_sets(self) -> Counter:
         return Counter(dict(self.read_query(query)))
 
     def summarize_nodes(self) -> Counter:
-        # TODO count number of "equivalence classes"
         query = """\
         MATCH (n:evidence)   WITH count(n) as count RETURN 'Evidences'    as label, count UNION ALL
         MATCH (n:concept)    WITH count(n) as count RETURN 'References'     as label, count UNION ALL
@@ -208,6 +208,25 @@ def summarize_nodes(self) -> Counter:
         """
         return Counter(dict(self.read_query(query)))
 
+    def summarize_concepts(self) -> Counter:
+        """Count concept nodes per prefix, leaving out nodes whose prefix is ``orcid``.
+
+        :returns: A counter built from the ``(prefix, count)`` rows returned by the query.
+        """
+        query = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)"
+        return Counter(dict(self.read_query(query)))
+
+    def summarize_authors(self) -> Counter:
+        """Count evidence nodes per author, following ``hasAuthor`` edges to concept nodes.
+
+        :returns: A counter built from the ``(author CURIE, evidence count)`` rows returned by the query.
+        """
+        query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, count(e)"
+        return Counter(dict(self.read_query(query)))
+
+    def get_highest_exact_matches(self, limit: int = 10) -> Counter:
+        """Get the priority nodes that have the most distinct exact matches.
+
+        :param limit: Passed to the query as ``$limit`` to cap the number of rows returned.
+        :returns: A counter built from the ``(CURIE, distinct match count)`` rows returned by the query.
+        """
+        # The relationship pattern is undirected, so matches are counted in both directions;
+        # only nodes with a truthy ``priority`` property are used as the starting node ``a``.
+        query = "MATCH (a)-[:`skos:exactMatch`]-(b) WHERE a.priority RETURN a.curie, count(distinct b) as c ORDER BY c DESCENDING LIMIT $limit"
+        return Counter(dict(self.read_query(query, limit=limit)))
+
+    def get_exact_matches(self, curie: str) -> set[Reference]:
+        """Get the references connected to the given node by a ``skos:exactMatch`` edge in either direction.
+
+        :param curie: The value of the ``curie`` property of the node to start from.
+        :returns: A set of references parsed from the ``curie`` property of each matched node.
+        """
+        query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b"
+        # Each row holds exactly one value (the node ``b``), hence the one-element unpacking ``node,``.
+        return {Reference.from_curie(node["curie"]) for node, in self.read_query(query, curie=curie)}
+
+    def get_concept_name(self, curie: str) -> str | None:
+        """Look up the name for a CURIE.
+
+        This delegates to :func:`semra.io._get_name_by_curie` and does not query the database.
+
+        :param curie: The CURIE to look up.
+        :returns: Whatever the helper returns, which is declared as ``str | None``;
+            callers must handle ``None``.
+        """
+        return _get_name_by_curie(curie)
+
 
 # Follows example here:
 # https://neo4j.com/docs/python-manual/current/session-api/#python-driver-simple-transaction-fn

diff --git a/src/semra/io.py b/src/semra/io.py
@@ -5,7 +5,7 @@
 import pickle
 from pathlib import Path
 from textwrap import dedent
-from typing import TextIO
+from typing import TextIO, cast
 
 import bioontologies
 import bioregistry
@@ -16,6 +16,7 @@
 import pyobo.utils
 from tqdm.auto import tqdm
 
+from bioregistry import Collection
 from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING
 from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
 
@@ -341,13 +342,12 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat
 
 SKIP_PREFIXES = {
     "pubchem",
+    "pubchem.compound",
+    "pubchem.substance",
     "kegg",
     "snomedct",
-    "icd9",
-    "icd10",
-    "icd11",
-    "icd",
 }
+SKIP_PREFIXES.update(cast(Collection, bioregistry.get_collection("0000004")).resources)
 
 
 def _get_name_by_curie(curie: str) -> str | None:
@@ -418,8 +418,10 @@ def _edge_key(t):
 def write_neo4j(
     mappings: list[Mapping],
     directory: str | Path,
+    *,
     docker_name: str | None = None,
-    priority_references: set[Reference] | None = None,
+    equivalence_classes: dict[Reference, bool] | None = None,
+    add_labels: bool = False,
 ) -> None:
     directory = Path(directory).resolve()
     if not directory.is_dir():
@@ -431,9 +433,9 @@ def write_neo4j(
 
     concept_nodes_path = directory.joinpath("concept_nodes.tsv")
     concepts: set[Reference] = set()
-    if priority_references is None:
-        priority_references = set()
-    concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "priority:boolean"]
+    concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "priority:boolean"]
+    if equivalence_classes is None:
+        equivalence_classes = {}
 
     mapping_nodes_path = directory.joinpath("mapping_nodes.tsv")
     mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence"]
@@ -493,7 +495,13 @@ def write_neo4j(
         concept_nodes_path,
         concept_nodes_header,
         (
-            (concept.curie, "concept", concept.prefix, "true" if concept in priority_references else "false")
+            (
+                concept.curie,
+                "concept",
+                concept.prefix,
+                pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "",
+                "true" if equivalence_classes.get(concept, False) else "false",
+            )
             for concept in sorted(concepts, key=lambda n: n.curie)
         ),
     )

diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py
@@ -111,9 +111,7 @@ class Configuration(BaseModel):
     priority_sssom_path: Path | None = None
     # note that making a priority neo4j doesn't make sense
 
-    sssom_add_labels: bool = Field(
-        default=False, description="Should PyOBO be used to look up labels for SSSOM output?"
-    )
+    add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?")
 
     @root_validator(skip_on_failure=True)
     def infer_priority(cls, values):  # noqa:N805
@@ -153,9 +151,14 @@ def get_mappings_from_config(
         if configuration.raw_pickle_path:
             write_pickle(raw_mappings, configuration.raw_pickle_path)
         if configuration.raw_sssom_path:
-            write_sssom(raw_mappings, configuration.raw_sssom_path, add_labels=configuration.sssom_add_labels)
+            write_sssom(raw_mappings, configuration.raw_sssom_path, add_labels=configuration.add_labels)
         if configuration.raw_neo4j_path:
-            write_neo4j(raw_mappings, configuration.raw_neo4j_path, configuration.raw_neo4j_name)
+            write_neo4j(
+                raw_mappings,
+                configuration.raw_neo4j_path,
+                docker_name=configuration.raw_neo4j_name,
+                add_labels=configuration.add_labels,
+            )
 
     # click.echo(semra.api.str_source_target_counts(mappings, minimum=20))
     processed_mappings = process(
@@ -172,24 +175,34 @@ def get_mappings_from_config(
     if configuration.processed_pickle_path:
         write_pickle(processed_mappings, configuration.processed_pickle_path)
     if configuration.processed_sssom_path:
-        write_sssom(processed_mappings, configuration.processed_sssom_path, add_labels=configuration.sssom_add_labels)
+        write_sssom(processed_mappings, configuration.processed_sssom_path, add_labels=configuration.add_labels)
     if configuration.processed_neo4j_path:
-        priority_references = {mapping.o for mapping in prioritized_mappings}
+        equivalence_classes = _get_equivalence_classes(processed_mappings, prioritized_mappings)
         write_neo4j(
             processed_mappings,
             configuration.processed_neo4j_path,
-            configuration.processed_neo4j_name,
-            priority_references=priority_references,
+            docker_name=configuration.processed_neo4j_name,
+            equivalence_classes=equivalence_classes,
+            add_labels=configuration.add_labels,
         )
 
     if configuration.priority_pickle_path:
         write_pickle(prioritized_mappings, configuration.priority_pickle_path)
     if configuration.priority_sssom_path:
-        write_sssom(prioritized_mappings, configuration.priority_sssom_path, add_labels=configuration.sssom_add_labels)
+        write_sssom(prioritized_mappings, configuration.priority_sssom_path, add_labels=configuration.add_labels)
 
     return prioritized_mappings
 
 
+def _get_equivalence_classes(mappings, prioritized_mappings) -> dict[Reference, bool]:
+    """Flag each reference used in the mappings by whether it is a priority reference.
+
+    :param mappings: Mappings whose subjects and objects become the keys of the result.
+    :param prioritized_mappings: Mappings whose objects are taken as the priority references.
+    :returns: A dictionary from every subject and object in ``mappings`` to ``True`` if it is
+        the object of some prioritized mapping, otherwise ``False``.
+    """
+    # Only the object side of a prioritized mapping counts as a priority reference.
+    priority_references = {mapping.o for mapping in prioritized_mappings}
+    rv = {}
+    for mapping in mappings:
+        rv[mapping.s] = mapping.s in priority_references
+        rv[mapping.o] = mapping.o in priority_references
+    return rv
+
+
 def get_raw_mappings(configuration: Configuration) -> list[Mapping]:
     """Get raw mappings based on the inputs in a configuration."""
     mappings = []
@@ -265,11 +278,12 @@ def process(
     # logger.debug(f"Filtered to {len(mappings):,} mappings")
 
     # remove mapping between self, such as EFO-EFO
-    logger.info("Removing self mappings (i.e., within a given semantic space)")
-    before = len(mappings)
-    start = time.time()
-    mappings = filter_self_matches(mappings)
-    _log_diff(before, mappings, verb="Filtered source internal", elapsed=time.time() - start)
+    # TODO handle self-mappings better using "replaced by" relations
+    # logger.info("Removing self mappings (i.e., within a given semantic space)")
+    # before = len(mappings)
+    # start = time.time()
+    # mappings = filter_self_matches(mappings)
+    # _log_diff(before, mappings, verb="Filtered source internal", elapsed=time.time() - start)
 
     if upgrade_prefixes:
         logger.info("Inferring mapping upgrades")

diff --git a/src/semra/rules.py b/src/semra/rules.py
@@ -8,6 +8,7 @@
 CLOSE_MATCH = Reference(prefix="skos", identifier="closeMatch")
 DB_XREF = Reference(prefix="oboinowl", identifier="hasDbXref")
 EQUIVALENT_TO = Reference(prefix="owl", identifier="equivalentTo")
+REPLACED_BY = Reference(prefix="iao", identifier="0100001")
 
 IMPRECISE = {DB_XREF, CLOSE_MATCH}
 FLIP = {

diff --git a/src/semra/templates/concept.html b/src/semra/templates/concept.html
@@ -0,0 +1,52 @@
+{% extends "base.html" %}
+{#
+    Concept page: shows the concept's CURIE (linked to the Bioregistry) and a table
+    of its exact matches, one row per match, sorted by CURIE.
+
+    Context variables read by this template: curie, exact_matches, has_biomappings,
+    and reference.
+    NOTE(review): reference is only used for the same-prefix check in the last column
+    and is presumably the parsed form of curie -- confirm the view passes it.
+    The last column is only rendered when has_biomappings is truthy.
+#}
+
+{% import "bootstrap5/utils.html" as util %}
+
+{% block title %}SeMRA{% endblock %}
+
+
+{% block content %}
+<div class="container" style="margin-top: 50px; margin-bottom: 50px">
+    {{ util.render_messages(dismissible=True, container=False) }}
+    <div class="row">
+        <div class="card">
+            <div class="card-body">
+                <h5 class="card-title">
+                    <a href="https://bioregistry.io/{{ curie }}">{{ curie }}</a>
+                </h5>
+                <h6>Exact Matches</h6>
+            </div>
+            <table class="table table-striped table-borderless">
+                <tbody>
+                {% for exact_match in exact_matches | sort(attribute='curie') %}
+                <tr>
+                    <td>
+                        <code>{{ exact_match.curie }}</code>
+                    </td>
+                    <td>
+                        <a href="{{ url_for('view_concept', curie=exact_match.curie) }}">SeMRA</a>
+                    </td>
+                    <td>
+                        <a href="https://bioregistry.io/{{ exact_match.curie }}">Bioregistry</a>
+                    </td>
+                    {% if has_biomappings %}
+                    <td>
+                        {% if reference.prefix == exact_match.prefix %}
+
+                        Handle in-prefix mapping
+
+                        {% else %}
+                        <a href="{{ url_for('mark_exact_incorrect', source=curie, target=exact_match.curie) }}">
+                            Mark as Incorrect
+                        </a>
+                        {% endif %}
+                    </td>
+                    {% endif %}
+                </tr>
+                {% endfor %}
+                </tbody>
+            </table>
+        </div>
+    </div>
+</div>
+{% endblock %}