Skip to content

Commit

Permalink
Update concept graph and home page
Browse files Browse the repository at this point in the history
  • Loading branch information
cthoyt committed Oct 17, 2023
1 parent 137752c commit e169041
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 51 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies = [
"tqdm",
"more_itertools",
"networkx",
"bioregistry",
"bioontologies",
"pyobo",
]
Expand Down
45 changes: 28 additions & 17 deletions src/semra/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import pydantic
from neo4j import Transaction, unit_of_work

import bioregistry
import semra
from semra import Evidence, MappingSet, Reference
from semra.io import _get_name_by_curie
Expand Down Expand Up @@ -181,55 +182,65 @@ def get_evidence(self, curie: str) -> Evidence:
res = self.read_query(query, curie=curie)
return res[0][0]

def summarize_predicates(self) -> t.Counter[str]:
    """Count the mapping nodes in the graph, grouped by predicate.

    :returns: A counter keyed by the ``predicate`` property of ``mapping``
        nodes, valued by how many mapping nodes carry that predicate.
    """
    rows = self.read_query("MATCH (m:mapping) RETURN m.predicate, count(m.predicate)")
    counter: t.Counter[str] = Counter()
    for predicate, count in rows:
        counter[predicate] = count
    return counter

def summarize_justifications(self) -> t.Counter[str]:
    """Count the evidence nodes in the graph, grouped by mapping justification.

    :returns: A counter keyed by the ``mapping_justification`` property of
        ``evidence`` nodes, with a leading ``semapv:`` stripped from each key.
    """
    query = "MATCH (e:evidence) RETURN e.mapping_justification, count(e.mapping_justification)"
    counter: t.Counter[str] = Counter()
    for justification, count in self.read_query(query):
        # Drop the CURIE prefix so only the local identifier is used as the key.
        counter[justification.removeprefix("semapv:")] = count
    return counter

def summarize_evidence_types(self) -> t.Counter[str]:
    """Count the evidence nodes in the graph, grouped by their ``type`` property.

    :returns: A counter keyed by ``e.type`` and valued by the number of
        ``evidence`` nodes having that type.
    """
    rows = self.read_query("MATCH (e:evidence) RETURN e.type, count(e.type)")
    counter: t.Counter[str] = Counter()
    for evidence_type, count in rows:
        counter[evidence_type] = count
    return counter

def summarize_mapping_sets(self) -> t.Counter[str]:
    """Count how many evidences point at each mapping set.

    :returns: A counter keyed by the CURIE of each ``mappingset`` node and
        valued by the number of ``evidence`` nodes linked to it via ``fromSet``.
    """
    query = "MATCH (e:evidence)-[:fromSet]->(s:mappingset) RETURN s.curie, count(e)"
    counter: t.Counter[str] = Counter()
    for mapping_set_curie, count in self.read_query(query):
        counter[mapping_set_curie] = count
    return counter

def summarize_nodes(self) -> t.Counter[str]:
    """Count the nodes in the graph by kind, under human-readable labels.

    The single Cypher query is a ``UNION ALL`` of one count per label:

    - ``Evidences``: all ``evidence`` nodes
    - ``Concepts``: all ``concept`` nodes
    - ``Equivalence Classes``: ``concept`` nodes whose ``priority`` is true
    - ``Mappings``: all ``mapping`` nodes
    - ``Mapping Sets``: all ``mappingset`` nodes

    :returns: A counter keyed by the labels above.
    """
    query = """\
        MATCH (n:evidence) WITH count(n) as count RETURN 'Evidences' as label, count UNION ALL
        MATCH (n:concept) WITH count(n) as count RETURN 'Concepts' as label, count UNION ALL
        MATCH (n:concept) WHERE n.priority WITH count(n) as count RETURN 'Equivalence Classes' as label, count UNION ALL
        MATCH (n:mapping) WITH count(n) as count RETURN 'Mappings' as label, count UNION ALL
        MATCH (n:mappingset) WITH count(n) as count RETURN 'Mapping Sets' as label, count
    """
    counter: t.Counter[str] = Counter()
    for label, count in self.read_query(query):
        counter[label] = count
    return counter

def summarize_concepts(self) -> t.Counter[tuple[str, str]]:
    """Count the concept nodes in the graph, grouped by prefix.

    Concepts whose prefix is ``orcid`` are excluded by the query.

    :returns: A counter keyed by ``(prefix, name)`` pairs, where ``name`` is
        the Bioregistry name for the prefix. When Bioregistry has no name for
        a prefix, the name is the empty string, so every key is a pair of
        strings and nothing renders as a literal ``None``.
    """
    query = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)"
    counter: t.Counter[tuple[str, str]] = Counter()
    for prefix, count in self.read_query(query):
        # bioregistry.get_name() yields None for prefixes it does not know.
        counter[prefix, bioregistry.get_name(prefix) or ""] = count
    return counter

def summarize_authors(self) -> t.Counter[tuple[str, str]]:
    """Count how many evidences each author is attached to.

    :returns: A counter keyed by ``(curie, name)`` of each ``concept`` node
        reached from an ``evidence`` node via ``hasAuthor``, valued by the
        number of such evidences.
    """
    query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, a.name, count(e)"
    counter: t.Counter[tuple[str, str]] = Counter()
    for curie, name, count in self.read_query(query):
        counter[curie, name] = count
    return counter

def get_highest_exact_matches(self, limit: int = 10) -> t.Counter[tuple[str, str]]:
    """Get the priority concepts with the most distinct exact matches.

    :param limit: Maximum number of concepts to return; passed to the query
        as the ``$limit`` parameter.
    :returns: A counter keyed by ``(curie, name)`` of each node whose
        ``priority`` is true, valued by the number of distinct nodes joined
        to it by a ``skos:exactMatch`` relationship (in either direction).
    """
    query = """\
        MATCH (a)-[:`skos:exactMatch`]-(b)
        WHERE a.priority
        RETURN a.curie, a.name, count(distinct b) as c
        ORDER BY c DESCENDING
        LIMIT $limit
    """
    counter: t.Counter[tuple[str, str]] = Counter()
    for curie, name, count in self.read_query(query, limit=limit):
        counter[curie, name] = count
    return counter

def _count_with_name(self, query: str, **kwargs: Any) -> t.Counter[tuple[str, str]]:
return Counter({
(curie, name): count
for curie, name, count in self.read_query(query, **kwargs)
})

def get_exact_matches(self, curie: str) -> dict[Reference, str]:
    """Get the concepts that are exact matches of the given concept.

    :param curie: CURIE of the node whose exact matches are wanted.
    :returns: A mapping from the reference of each node joined to ``curie``
        by a ``skos:exactMatch`` relationship (in either direction) to that
        node's ``name`` property.
    """
    # Return the *matched* node ``b``; returning ``a`` would only echo the
    # queried concept back once per match.
    query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b.curie, b.name"
    return {
        Reference.from_curie(match_curie): name
        for match_curie, name in self.read_query(query, curie=curie)
    }

def get_connected_component(self, curie: str) -> tuple[list[neo4j.graph.Node], list[neo4j.graph.Relationship]]:
query = """\
Expand Down
65 changes: 48 additions & 17 deletions src/semra/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pyobo
import pyobo.utils
from bioregistry import Collection
from tqdm.auto import tqdm
from tqdm import tqdm

from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING
from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence
Expand All @@ -36,6 +36,12 @@
logger = logging.getLogger(__name__)


# Number of decimal places kept when confidence values are rounded for output.
CONFIDENCE_PRECISION = 5
# Relationship type of the edge from a mapping node to each of its evidence nodes.
HAS_EVIDENCE_PREDICATE = "hasEvidence"
# Relationship type of the edge from an evidence node to its mapping set node.
FROM_SET_PREDICATE = "fromSet"
# Relationship type of the edge from a reasoned evidence node to each mapping
# it was derived from.
DERIVED_PREDICATE = "derivedFromMapping"


def _safe_get_version(prefix: str) -> str | None:
try:
return bioversions.get_version(prefix)
Expand Down Expand Up @@ -353,6 +359,12 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat
def _get_name_by_curie(curie: str) -> str | None:
    """Look up a display name for a CURIE.

    CURIEs starting with any entry of ``SKIP_PREFIXES`` are never looked up.
    ``orcid:`` CURIEs are resolved against orcid.org; everything else is
    delegated to :func:`pyobo.get_name_by_curie`.

    :param curie: The compact URI to name.
    :returns: The name, or ``None`` when the CURIE is skipped or the ORCID
        record exposes no name.
    :raises requests.HTTPError: If orcid.org answers with an error status.
    """
    if any(curie.startswith(p) for p in SKIP_PREFIXES):
        return None
    if curie.startswith("orcid:"):
        import requests

        orcid = curie.removeprefix("orcid:")
        response = requests.get(
            f"https://orcid.org/{orcid}",
            headers={"Accept": "application/json"},
            # requests waits forever by default; bound the wait so one
            # unresponsive lookup cannot stall a whole export.
            timeout=15,
        )
        response.raise_for_status()
        # NOTE(review): assumes ORCID returns JSON null for a hidden name or a
        # missing given/family name -- confirm against the ORCID API docs.
        name = (response.json().get("person") or {}).get("name") or {}
        parts = [(name.get(field) or {}).get("value") for field in ("given-names", "family-name")]
        full_name = " ".join(part for part in parts if part)
        return full_name or None
    return pyobo.get_name_by_curie(curie)


Expand All @@ -374,7 +386,7 @@ def _get_sssom_row(mapping: Mapping, e: Evidence):
",".join(sorted(e.mapping_set_names)),
mapping_set_version,
mapping_set_license,
round(confidence, 4) if (confidence := e.confidence) is not None else "",
round(confidence, CONFIDENCE_PRECISION) if (confidence := e.confidence) is not None else "",
e.author.curie if e.author else "",
e.explanation,
)
Expand Down Expand Up @@ -416,7 +428,7 @@ def _edge_key(t):


def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001
return "true" if b else "false"
return "true" if b else "false" # type:ignore


def write_neo4j(
Expand All @@ -442,7 +454,16 @@ def write_neo4j(
equivalence_classes = {}

mapping_nodes_path = directory.joinpath("mapping_nodes.tsv")
mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence", "hasPrimary:boolean"]
mapping_nodes_header = [
"curie:ID",
":LABEL",
"prefix",
"predicate",
"confidence",
"primary:boolean",
"secondary:boolean",
"tertiary:boolean",
]

evidence_nodes_path = directory.joinpath("evidence_nodes.tsv")
evidences = {}
Expand All @@ -468,38 +489,46 @@ def write_neo4j(
]

edges_path = directory.joinpath("edges.tsv")
edges: list[tuple[str, str, str, str | float]] = []
edges: list[tuple[str, str, str, str | float, str, str, str, str]] = []
edges_header = [
":START_ID",
":TYPE",
":END_ID",
"confidence:float",
"hasPrimary:boolean",
"primary:boolean",
"secondary:boolean",
"tertiary:boolean",
"mapping_sets:string[]",
]

for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"):
mapping: Mapping
concepts.add(mapping.s)
concepts.add(mapping.o)

edges.append(
(
mapping.s.curie,
mapping.p.curie,
mapping.o.curie,
round(c, 4) if (c := mapping.confidence) is not None else "",
_neo4j_bool(mapping.has_primary_evidence),
round(c, CONFIDENCE_PRECISION) if (c := mapping.confidence) is not None else "",
_neo4j_bool(mapping.has_primary),
_neo4j_bool(mapping.has_secondary),
_neo4j_bool(mapping.has_tertiary),
"|".join(sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set})),
)
)
edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, ""))
edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, ""))
edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "", "", "", "", ""))
edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "", "", "", "", ""))
for evidence in mapping.evidence:
edges.append((mapping.curie, "hasEvidence", evidence.curie, ""))
edges.append((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie, "", "", "", "", ""))
evidences[evidence.key()] = evidence
if evidence.mapping_set:
mapping_sets[evidence.mapping_set.name] = evidence.mapping_set
edges.append((evidence.curie, "fromSet", evidence.mapping_set.curie, ""))
edges.append((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie, "", "", "", "", ""))
elif isinstance(evidence, ReasonedEvidence):
for mmm in evidence.mappings:
edges.append((evidence.curie, "derivedFromMapping", mmm.curie, ""))
edges.append((evidence.curie, DERIVED_PREDICATE, mmm.curie, "", "", "", "", ""))
elif isinstance(evidence, SimpleEvidence):
pass
else:
Expand All @@ -508,7 +537,7 @@ def write_neo4j(
# Add authorship information for the evidence, if available
if evidence.author:
concepts.add(evidence.author)
edges.append((evidence.curie, "hasAuthor", evidence.author.curie, ""))
edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "", "", "", "", ""))

_write_tsv(
concept_nodes_path,
Expand All @@ -518,7 +547,7 @@ def write_neo4j(
concept.curie,
"concept",
concept.prefix,
pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "",
_get_name_by_curie(concept.curie) or "" if add_labels else "",
_neo4j_bool(equivalence_classes.get(concept, False)),
)
for concept in sorted(concepts, key=lambda n: n.curie)
Expand All @@ -533,8 +562,10 @@ def write_neo4j(
"mapping",
"semra.mapping",
mapping.p.curie,
mapping.confidence and round(mapping.confidence, 4),
_neo4j_bool(mapping.has_primary_evidence),
mapping.confidence and round(mapping.confidence, CONFIDENCE_PRECISION),
_neo4j_bool(mapping.has_primary),
_neo4j_bool(mapping.has_secondary),
_neo4j_bool(mapping.has_tertiary),
)
for mapping in sorted(mappings, key=lambda n: n.curie)
),
Expand Down
20 changes: 18 additions & 2 deletions src/semra/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,9 +226,25 @@ def curie(self) -> str:
return self.get_reference().curie

@property
def has_primary(self) -> bool:
    """Get if there is a primary evidence associated with this mapping.

    An evidence is primary when it is a :class:`SimpleEvidence` whose mapping
    set name equals the prefix of this mapping's subject. Evidence without a
    mapping set is never primary.
    """
    return any(
        isinstance(evidence, SimpleEvidence)
        # mapping_set is treated as optional elsewhere in the package, so
        # guard before reading its name instead of raising AttributeError.
        and evidence.mapping_set is not None
        and evidence.mapping_set.name == self.s.prefix
        for evidence in self.evidence
    )

@property
def has_secondary(self) -> bool:
    """Get if there is a secondary evidence associated with this mapping.

    An evidence is secondary when it is a :class:`SimpleEvidence` that does
    not come from the mapping set named after this mapping's subject prefix.
    """
    return any(
        isinstance(evidence, SimpleEvidence)
        and (
            # NOTE(review): simple evidence without a mapping set is counted
            # as secondary here (it previously raised AttributeError), so that
            # every simple evidence is either primary or secondary -- confirm
            # this is the intended classification.
            evidence.mapping_set is None
            or evidence.mapping_set.name != self.s.prefix
        )
        for evidence in self.evidence
    )

@property
def has_tertiary(self) -> bool:
    """Get if there are any tertiary (i.e., reasoned) evidences for this mapping.

    Every evidence that is not a :class:`SimpleEvidence` counts as tertiary.
    """
    for evidence in self.evidence:
        if not isinstance(evidence, SimpleEvidence):
            return True
    return False


def line(*references: Reference) -> list[Mapping]:
Expand Down
15 changes: 6 additions & 9 deletions src/semra/templates/concept.html
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
</style>
<script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.26.0/cytoscape.min.js"></script>
<script src="https://unpkg.com/webcola/WebCola/cola.min.js"></script>
<script src="https://cytoscape.org/cytoscape.js-cola/cytoscape-cola.js"></script>
{% endblock %}

{% block scripts %}
Expand Down Expand Up @@ -52,24 +54,19 @@
"curve-style": "bezier",
'text-background-color': 'yellow',
'text-background-opacity': 0.4,
'width': '4px',
'width': '2px',
'target-arrow-shape': 'triangle',
'control-point-step-size': '140px',
autorotate: true
}
}
],
layout: {
name: 'cose',
animate: true,
padding: 100,
name: 'cola',
nodeSpacing: function( node ){ return 75; },
// nodeDimensionsIncludeLabels: true
}
});

var layout = cy.layout({ name: 'cose' });
layout.run();

layout.run();
});
</script>
{% endblock %}
Expand Down
19 changes: 13 additions & 6 deletions src/semra/templates/home.html
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@ <h2 style="margin-bottom: 0">{{ count }}{{ suffix }}</h2>
</div>
{% endmacro %}

{% macro do_table(counter, label, is_concept=False) %}
{% macro do_table(counter, label, is_concept=False, has_names=False) %}
<div class="card-body">
<h6>{{ label }} Summary</h6>
</div>
<table class="table">
<thead>
<tr>
<th>{{ label }}</th>
{% if has_names %}<th>Name</th>{% endif %}
<th>Count</th>
</tr>
</thead>
Expand All @@ -30,11 +31,17 @@ <h6>{{ label }} Summary</h6>
<tr>
<td><code>
{%- if is_concept -%}
{% if has_names %}
<a href="{{ url_for('view_concept', curie=key[0]) }}">{{ key[0] }}</a>
{% else %}
FAILURE!
<a href="{{ url_for('view_concept', curie=key) }}">{{ key }}</a>
{% endif %}
{%- else -%}
{{ key }}
{% if has_names %}{{ key[0] }}{% else %}{{ key }}{% endif %}
{%- endif -%}
</code></td>
{% if has_names %}<td>{{ key[1] }}</td>{% endif %}
<td align="right">{{ "{:,}".format(count) }}</td>
</tr>
{% endfor %}
Expand All @@ -53,8 +60,8 @@ <h5 class="card-title">
Semantic Reasoning Assembler
</h5>
<div class="row text-center stats" style="padding-top: 1em; padding-bottom: 1em;">
{{ count_column(node_counter, "References", "dna") }}
{{ count_column(node_counter, "Concepts", "dna") }}
{{ count_column(node_counter, "Equivalence Classes", "dna") }}
{{ count_column(node_counter, "Mappings", "arrows-alt") }}
{{ count_column(node_counter, "Evidences", "book") }}
{{ count_column(node_counter, "Mapping Sets", "puzzle-piece") }}
Expand Down Expand Up @@ -94,9 +101,9 @@ <h6>Mapping Sets</h6>
{{ do_table(evidence_type_counter, "Evidence Type") }}
{{ do_table(predicate_counter, "Predicate") }}
{{ do_table(justification_counter, "Mapping Justification") }}
{{ do_table(prefix_counter, "Prefix") }}
{{ do_table(author_counter, "Author") }}
{{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True) }}
{{ do_table(prefix_counter, "Prefix", has_names=True) }}
{{ do_table(author_counter, "Author", has_names=True) }}
{{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True, has_names=True) }}

<div class="card-body">
<h6>Example Mappings</h6>
Expand Down

0 comments on commit e169041

Please sign in to comment.