Update concept graph and home page

biopragmatics · Oct 17, 2023 · e169041 · e169041
1 parent 137752c
commit e169041
Show file tree

Hide file tree

Showing 6 changed files with 114 additions and 51 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
   "tqdm",
   "more_itertools",
   "networkx",
+  "bioregistry",
   "bioontologies",
   "pyobo",
 ]

diff --git a/src/semra/client.py b/src/semra/client.py
@@ -11,6 +11,7 @@
 import pydantic
 from neo4j import Transaction, unit_of_work
 
+import bioregistry
 import semra
 from semra import Evidence, MappingSet, Reference
 from semra.io import _get_name_by_curie
@@ -181,55 +182,65 @@ def get_evidence(self, curie: str) -> Evidence:
         res = self.read_query(query, curie=curie)
         return res[0][0]
 
-    def summarize_predicates(self) -> Counter:
+    def summarize_predicates(self) -> t.Counter[str]:
         """Get a counter of predicates."""
         query = "MATCH (m:mapping) RETURN m.predicate, count(m.predicate)"
         return Counter(dict(self.read_query(query)))
 
-    def summarize_justifications(self) -> Counter:
+    def summarize_justifications(self) -> t.Counter[str]:
         """Get a counter of mapping justifications."""
         query = "MATCH (e:evidence) RETURN e.mapping_justification, count(e.mapping_justification)"
         return Counter({k.removeprefix("semapv:"): v for k, v in self.read_query(query)})
 
-    def summarize_evidence_types(self) -> Counter:
+    def summarize_evidence_types(self) -> t.Counter[str]:
         query = "MATCH (e:evidence) RETURN e.type, count(e.type)"
         return Counter(dict(self.read_query(query)))
 
-    def summarize_mapping_sets(self) -> Counter:
+    def summarize_mapping_sets(self) -> t.Counter[str]:
         """Get the number of evidences in each mapping set."""
         query = "MATCH (e:evidence)-[:fromSet]->(s:mappingset) RETURN s.curie, count(e)"
         return Counter(dict(self.read_query(query)))
 
-    def summarize_nodes(self) -> Counter:
+    def summarize_nodes(self) -> t.Counter[str]:
         query = """\
         MATCH (n:evidence)   WITH count(n) as count RETURN 'Evidences'    as label, count UNION ALL
-        MATCH (n:concept)    WITH count(n) as count RETURN 'References'     as label, count UNION ALL
-        MATCH (n:concept)    WHERE n.priority WITH count(n) as count RETURN 'Concepts'     as label, count UNION ALL
+        MATCH (n:concept)    WITH count(n) as count RETURN 'Concepts'     as label, count UNION ALL
+        MATCH (n:concept)    WHERE n.priority WITH count(n) as count RETURN 'Equivalence Classes'     as label, count UNION ALL
         MATCH (n:mapping)    WITH count(n) as count RETURN 'Mappings'     as label, count UNION ALL
         MATCH (n:mappingset) WITH count(n) as count RETURN 'Mapping Sets' as label, count
         """
         return Counter(dict(self.read_query(query)))
 
-    def summarize_concepts(self) -> Counter:
+    def summarize_concepts(self) -> t.Counter[tuple[str, str]]:
         query = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)"
-        return Counter(dict(self.read_query(query)))
+        return Counter({
+            (prefix, bioregistry.get_name(prefix)): count
+            for prefix, count in self.read_query(query)
+        })
 
-    def summarize_authors(self) -> Counter:
-        query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, count(e)"
-        return Counter(dict(self.read_query(query)))
+    def summarize_authors(self) -> t.Counter[tuple[str, str]]:
+        query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, a.name, count(e)"
+        return self._count_with_name(query)
 
-    def get_highest_exact_matches(self, limit: int = 10) -> Counter:
+    def get_highest_exact_matches(self, limit: int = 10) -> t.Counter[tuple[str, str]]:
         query = """\
             MATCH (a)-[:`skos:exactMatch`]-(b)
-            WHERE a.priority RETURN a.curie, count(distinct b) as c
+            WHERE a.priority 
+            RETURN a.curie, a.name, count(distinct b) as c
             ORDER BY c DESCENDING
             LIMIT $limit
         """
-        return Counter(dict(self.read_query(query, limit=limit)))
+        return self._count_with_name(query, limit=limit)
+
+    def _count_with_name(self, query: str, **kwargs: t.Any) -> t.Counter[tuple[str, str]]:
+        return Counter({  # each result row must be (curie, name, count); the key is the (curie, name) pair
+            (curie, name): count
+            for curie, name, count in self.read_query(query, **kwargs)
+        })
 
     def get_exact_matches(self, curie: str) -> dict[Reference, str]:
-        query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b"
-        return {Reference.from_curie(node["curie"]): node["name"] for node, in self.read_query(query, curie=curie)}
+        query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b.curie, b.name"  # b = matched neighbours, not the queried node a
+        return {Reference.from_curie(n_curie): name for n_curie, name in self.read_query(query, curie=curie)}
 
     def get_connected_component(self, curie: str) -> tuple[list[neo4j.graph.Node], list[neo4j.graph.Relationship]]:
         query = """\

diff --git a/src/semra/io.py b/src/semra/io.py
@@ -15,7 +15,7 @@
 import pyobo
 import pyobo.utils
 from bioregistry import Collection
-from tqdm.auto import tqdm
+from tqdm import tqdm
 
 from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING
 from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
@@ -36,6 +36,12 @@
 logger = logging.getLogger(__name__)
 
 
+CONFIDENCE_PRECISION = 5  # decimal places kept when rounding confidences for output
+HAS_EVIDENCE_PREDICATE = "hasEvidence"  # edge type: mapping -> one of its evidences
+FROM_SET_PREDICATE = "fromSet"  # edge type: evidence -> the mapping set it came from
+DERIVED_PREDICATE = "derivedFromMapping"  # edge type: reasoned evidence -> a mapping it was derived from
+
+
 def _safe_get_version(prefix: str) -> str | None:
     try:
         return bioversions.get_version(prefix)
@@ -353,6 +359,12 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat
 def _get_name_by_curie(curie: str) -> str | None:
     if any(curie.startswith(p) for p in SKIP_PREFIXES):
         return None
+    if curie.startswith("orcid:"):
+        import requests
+        # requests applies no timeout by default; bound the call so one stalled lookup cannot hang the export
+        orcid = curie.removeprefix("orcid:")
+        res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=30).json()
+        return res["person"]["name"]["given-names"]["value"] + " " + res["person"]["name"]["family-name"]["value"]
     return pyobo.get_name_by_curie(curie)
 
 
@@ -374,7 +386,7 @@ def _get_sssom_row(mapping: Mapping, e: Evidence):
         ",".join(sorted(e.mapping_set_names)),
         mapping_set_version,
         mapping_set_license,
-        round(confidence, 4) if (confidence := e.confidence) is not None else "",
+        round(confidence, CONFIDENCE_PRECISION) if (confidence := e.confidence) is not None else "",
         e.author.curie if e.author else "",
         e.explanation,
     )
@@ -416,7 +428,7 @@ def _edge_key(t):
 
 
 def _neo4j_bool(b: bool, /) -> Literal["true", "false"]:  # noqa:FBT001
-    return "true" if b else "false"
+    return "true" if b else "false"  # type:ignore
 
 
 def write_neo4j(
@@ -442,7 +454,16 @@ def write_neo4j(
         equivalence_classes = {}
 
     mapping_nodes_path = directory.joinpath("mapping_nodes.tsv")
-    mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence", "hasPrimary:boolean"]
+    mapping_nodes_header = [
+        "curie:ID",
+        ":LABEL",
+        "prefix",
+        "predicate",
+        "confidence",
+        "primary:boolean",
+        "secondary:boolean",
+        "tertiary:boolean",
+    ]
 
     evidence_nodes_path = directory.joinpath("evidence_nodes.tsv")
     evidences = {}
@@ -468,38 +489,46 @@ def write_neo4j(
     ]
 
     edges_path = directory.joinpath("edges.tsv")
-    edges: list[tuple[str, str, str, str | float]] = []
+    edges: list[tuple[str, str, str, str | float, str, str, str, str]] = []
     edges_header = [
         ":START_ID",
         ":TYPE",
         ":END_ID",
         "confidence:float",
-        "hasPrimary:boolean",
+        "primary:boolean",
+        "secondary:boolean",
+        "tertiary:boolean",
+        "mapping_sets:string[]",
     ]
 
     for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"):
+        mapping: Mapping
         concepts.add(mapping.s)
         concepts.add(mapping.o)
+
         edges.append(
             (
                 mapping.s.curie,
                 mapping.p.curie,
                 mapping.o.curie,
-                round(c, 4) if (c := mapping.confidence) is not None else "",
-                _neo4j_bool(mapping.has_primary_evidence),
+                round(c, CONFIDENCE_PRECISION) if (c := mapping.confidence) is not None else "",
+                _neo4j_bool(mapping.has_primary),
+                _neo4j_bool(mapping.has_secondary),
+                _neo4j_bool(mapping.has_tertiary),
+                "|".join(sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set})),
             )
         )
-        edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, ""))
-        edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, ""))
+        edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "", "", "", "", ""))
+        edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "", "", "", "", ""))
         for evidence in mapping.evidence:
-            edges.append((mapping.curie, "hasEvidence", evidence.curie, ""))
+            edges.append((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie, "", "", "", "", ""))
             evidences[evidence.key()] = evidence
             if evidence.mapping_set:
                 mapping_sets[evidence.mapping_set.name] = evidence.mapping_set
-                edges.append((evidence.curie, "fromSet", evidence.mapping_set.curie, ""))
+                edges.append((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie, "", "", "", "", ""))
             elif isinstance(evidence, ReasonedEvidence):
                 for mmm in evidence.mappings:
-                    edges.append((evidence.curie, "derivedFromMapping", mmm.curie, ""))
+                    edges.append((evidence.curie, DERIVED_PREDICATE, mmm.curie, "", "", "", "", ""))
             elif isinstance(evidence, SimpleEvidence):
                 pass
             else:
@@ -508,7 +537,7 @@ def write_neo4j(
             # Add authorship information for the evidence, if available
             if evidence.author:
                 concepts.add(evidence.author)
-                edges.append((evidence.curie, "hasAuthor", evidence.author.curie, ""))
+                edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "", "", "", "", ""))
 
     _write_tsv(
         concept_nodes_path,
@@ -518,7 +547,7 @@ def write_neo4j(
                 concept.curie,
                 "concept",
                 concept.prefix,
-                pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "",
+                _get_name_by_curie(concept.curie) or "" if add_labels else "",
                 _neo4j_bool(equivalence_classes.get(concept, False)),
             )
             for concept in sorted(concepts, key=lambda n: n.curie)
@@ -533,8 +562,10 @@ def write_neo4j(
                 "mapping",
                 "semra.mapping",
                 mapping.p.curie,
-                mapping.confidence and round(mapping.confidence, 4),
-                _neo4j_bool(mapping.has_primary_evidence),
+                mapping.confidence and round(mapping.confidence, CONFIDENCE_PRECISION),
+                _neo4j_bool(mapping.has_primary),
+                _neo4j_bool(mapping.has_secondary),
+                _neo4j_bool(mapping.has_tertiary),
             )
             for mapping in sorted(mappings, key=lambda n: n.curie)
         ),

diff --git a/src/semra/struct.py b/src/semra/struct.py
@@ -226,9 +226,25 @@ def curie(self) -> str:
         return self.get_reference().curie
 
     @property
-    def has_primary_evidence(self) -> bool:
+    def has_primary(self) -> bool:
         """Get if there is a primary evidence associated with this mapping."""
-        return any(isinstance(evidence, SimpleEvidence) for evidence in self.evidence)
+        return any(
+            isinstance(evidence, SimpleEvidence) and evidence.mapping_set.name == self.s.prefix
+            for evidence in self.evidence
+        )
+
+    @property
+    def has_secondary(self) -> bool:
+        """Get if there is a secondary evidence associated with this mapping."""
+        return any(
+            isinstance(evidence, SimpleEvidence) and evidence.mapping_set.name != self.s.prefix
+            for evidence in self.evidence
+        )
+
+    @property
+    def has_tertiary(self) -> bool:
+        """Get if there are any tertiary (i.e., reasoned) evidences for this mapping."""
+        return any(not isinstance(evidence, SimpleEvidence) for evidence in self.evidence)
 
 
 def line(*references: Reference) -> list[Mapping]:

diff --git a/src/semra/templates/concept.html b/src/semra/templates/concept.html
@@ -15,6 +15,8 @@
 </style>
 <script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
 <script src="https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.26.0/cytoscape.min.js"></script>
+<script src="https://unpkg.com/webcola/WebCola/cola.min.js"></script>
+<script src="https://cytoscape.org/cytoscape.js-cola/cytoscape-cola.js"></script>
 {% endblock %}
 
 {% block scripts %}
@@ -52,24 +54,19 @@
                         "curve-style": "bezier",
                         'text-background-color': 'yellow',
                         'text-background-opacity': 0.4,
-                        'width': '4px',
+                        'width': '2px',
                         'target-arrow-shape': 'triangle',
                         'control-point-step-size': '140px',
                         autorotate: true
                     }
                 }
             ],
             layout: {
-                name: 'cose',
-                animate: true,
-                padding: 100,
+                name: 'cola',
+                nodeSpacing: function( node ){ return 75; },
+                // nodeDimensionsIncludeLabels: true
             }
         });
-
-        var layout = cy.layout({ name: 'cose' });
-        layout.run();
-
-layout.run();
     });
 </script>
 {% endblock %}

diff --git a/src/semra/templates/home.html b/src/semra/templates/home.html
@@ -14,14 +14,15 @@ <h2 style="margin-bottom: 0">{{ count }}{{ suffix }}</h2>
 </div>
 {% endmacro %}
 
-{% macro do_table(counter, label, is_concept=False) %}
+{% macro do_table(counter, label, is_concept=False, has_names=False) %}
 <div class="card-body">
     <h6>{{ label }} Summary</h6>
 </div>
 <table class="table">
     <thead>
     <tr>
         <th>{{ label }}</th>
+        {% if has_names %}<th>Name</th>{% endif %}
         <th>Count</th>
     </tr>
     </thead>
@@ -30,11 +31,17 @@ <h6>{{ label }} Summary</h6>
     <tr>
         <td><code>
         {%- if is_concept -%}
+            {% if has_names %}
+            <a href="{{ url_for('view_concept', curie=key[0]) }}">{{ key[0] }}</a>
+            {% else %}
+            {# without names, the key itself is used as the CURIE #}
             <a href="{{ url_for('view_concept', curie=key) }}">{{ key }}</a>
+            {% endif %}
             {%- else -%}
-            {{ key }}
+            {% if has_names %}{{ key[0] }}{% else %}{{ key }}{% endif %}
             {%- endif -%}
         </code></td>
+        {% if has_names %}<td>{{ key[1] }}</td>{% endif %}
         <td align="right">{{ "{:,}".format(count) }}</td>
     </tr>
     {% endfor %}
@@ -53,8 +60,8 @@ <h5 class="card-title">
                         Semantic Reasoning Assembler
                     </h5>
                     <div class="row text-center stats" style="padding-top: 1em; padding-bottom: 1em;">
-                        {{ count_column(node_counter, "References", "dna") }}
                         {{ count_column(node_counter, "Concepts", "dna") }}
+                        {{ count_column(node_counter, "Equivalence Classes", "dna") }}
                         {{ count_column(node_counter, "Mappings", "arrows-alt") }}
                         {{ count_column(node_counter, "Evidences", "book") }}
                         {{ count_column(node_counter, "Mapping Sets", "puzzle-piece") }}
@@ -94,9 +101,9 @@ <h6>Mapping Sets</h6>
                 {{ do_table(evidence_type_counter, "Evidence Type") }}
                 {{ do_table(predicate_counter, "Predicate") }}
                 {{ do_table(justification_counter, "Mapping Justification") }}
-                {{ do_table(prefix_counter, "Prefix") }}
-                {{ do_table(author_counter, "Author") }}
-                {{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True) }}
+                {{ do_table(prefix_counter, "Prefix", has_names=True) }}
+                {{ do_table(author_counter, "Author", has_names=True) }}
+                {{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True, has_names=True) }}
 
                 <div class="card-body">
                     <h6>Example Mappings</h6>