diff --git a/pyproject.toml b/pyproject.toml index fb4658f..81f98f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "tqdm", "more_itertools", "networkx", + "bioregistry", "bioontologies", "pyobo", ] diff --git a/src/semra/client.py b/src/semra/client.py index dfffe0b..a01fc7b 100644 --- a/src/semra/client.py +++ b/src/semra/client.py @@ -11,6 +11,7 @@ import pydantic from neo4j import Transaction, unit_of_work +import bioregistry import semra from semra import Evidence, MappingSet, Reference from semra.io import _get_name_by_curie @@ -181,55 +182,65 @@ def get_evidence(self, curie: str) -> Evidence: res = self.read_query(query, curie=curie) return res[0][0] - def summarize_predicates(self) -> Counter: + def summarize_predicates(self) -> t.Counter[str]: """Get a counter of predicates.""" query = "MATCH (m:mapping) RETURN m.predicate, count(m.predicate)" return Counter(dict(self.read_query(query))) - def summarize_justifications(self) -> Counter: + def summarize_justifications(self) -> t.Counter[str]: """Get a counter of mapping justifications.""" query = "MATCH (e:evidence) RETURN e.mapping_justification, count(e.mapping_justification)" return Counter({k.removeprefix("semapv:"): v for k, v in self.read_query(query)}) - def summarize_evidence_types(self) -> Counter: + def summarize_evidence_types(self) -> t.Counter[str]: query = "MATCH (e:evidence) RETURN e.type, count(e.type)" return Counter(dict(self.read_query(query))) - def summarize_mapping_sets(self) -> Counter: + def summarize_mapping_sets(self) -> t.Counter[str]: """Get the number of evidences in each mapping set.""" query = "MATCH (e:evidence)-[:fromSet]->(s:mappingset) RETURN s.curie, count(e)" return Counter(dict(self.read_query(query))) - def summarize_nodes(self) -> Counter: + def summarize_nodes(self) -> t.Counter[str]: query = """\ MATCH (n:evidence) WITH count(n) as count RETURN 'Evidences' as label, count UNION ALL - MATCH (n:concept) WITH count(n) as count RETURN 'References' as label, count UNION ALL - MATCH (n:concept) WHERE n.priority WITH count(n) as count RETURN 'Concepts' as label, count UNION ALL + MATCH (n:concept) WITH count(n) as count RETURN 'Concepts' as label, count UNION ALL + MATCH (n:concept) WHERE n.priority WITH count(n) as count RETURN 'Equivalence Classes' as label, count UNION ALL MATCH (n:mapping) WITH count(n) as count RETURN 'Mappings' as label, count UNION ALL MATCH (n:mappingset) WITH count(n) as count RETURN 'Mapping Sets' as label, count """ return Counter(dict(self.read_query(query))) - def summarize_concepts(self) -> Counter: + def summarize_concepts(self) -> t.Counter[tuple[str, str]]: query = "MATCH (e:concept) WHERE e.prefix <> 'orcid' RETURN e.prefix, count(e.prefix)" - return Counter(dict(self.read_query(query))) + return Counter({ + (prefix, bioregistry.get_name(prefix)): count + for prefix, count in self.read_query(query) + }) - def summarize_authors(self) -> Counter: - query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, count(e)" - return Counter(dict(self.read_query(query))) + def summarize_authors(self) -> t.Counter[tuple[str, str]]: + query = "MATCH (e:evidence)-[:hasAuthor]->(a:concept) RETURN a.curie, a.name, count(e)" + return self._count_with_name(query) - def get_highest_exact_matches(self, limit: int = 10) -> Counter: + def get_highest_exact_matches(self, limit: int = 10) -> t.Counter[tuple[str, str]]: query = """\ MATCH (a)-[:`skos:exactMatch`]-(b) - WHERE a.priority RETURN a.curie, count(distinct b) as c + WHERE a.priority + RETURN a.curie, a.name, count(distinct b) as c ORDER BY c DESCENDING LIMIT $limit """ - return Counter(dict(self.read_query(query, limit=limit))) + return self._count_with_name(query, limit=limit) + + def _count_with_name(self, query: str, **kwargs: Any) -> t.Counter[tuple[str, str]]: + return Counter({ + (curie, name): count + for curie, name, count in self.read_query(query, **kwargs) + }) def get_exact_matches(self, curie: str) -> dict[Reference, str]: - query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN b" - return {Reference.from_curie(node["curie"]): node["name"] for node, in self.read_query(query, curie=curie)} + query = "MATCH (a {curie: $curie})-[:`skos:exactMatch`]-(b) RETURN a.curie, a.name" + return {Reference.from_curie(n_curie): name for n_curie, name in self.read_query(query, curie=curie)} def get_connected_component(self, curie: str) -> tuple[list[neo4j.graph.Node], list[neo4j.graph.Relationship]]: query = """\ diff --git a/src/semra/io.py b/src/semra/io.py index 371ccf1..63871bb 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -15,7 +15,7 @@ import pyobo import pyobo.utils from bioregistry import Collection -from tqdm.auto import tqdm +from tqdm import tqdm from semra.rules import DB_XREF, MANUAL_MAPPING, UNSPECIFIED_MAPPING from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence @@ -36,6 +36,12 @@ logger = logging.getLogger(__name__) +CONFIDENCE_PRECISION = 5 +HAS_EVIDENCE_PREDICATE = "hasEvidence" +FROM_SET_PREDICATE = "fromSet" +DERIVED_PREDICATE = "derivedFromMapping" + + def _safe_get_version(prefix: str) -> str | None: try: return bioversions.get_version(prefix) @@ -353,6 +359,12 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat def _get_name_by_curie(curie: str) -> str | None: if any(curie.startswith(p) for p in SKIP_PREFIXES): return None + if curie.startswith("orcid:"): + import requests + + orcid = curie.removeprefix("orcid:") + res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}).json() + return res["person"]["name"]["given-names"]["value"] + " " + res["person"]["name"]["family-name"]["value"] return pyobo.get_name_by_curie(curie) @@ -374,7 +386,7 @@ def _get_sssom_row(mapping: Mapping, e: Evidence): ",".join(sorted(e.mapping_set_names)), mapping_set_version, mapping_set_license, - round(confidence, 4) if (confidence := e.confidence) is not None else "", + round(confidence, CONFIDENCE_PRECISION) if (confidence := e.confidence) is not None else "", e.author.curie if e.author else "", e.explanation, ) @@ -416,7 +428,7 @@ def _edge_key(t): def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001 - return "true" if b else "false" + return "true" if b else "false" # type:ignore def write_neo4j( @@ -442,7 +454,16 @@ def write_neo4j( equivalence_classes = {} mapping_nodes_path = directory.joinpath("mapping_nodes.tsv") - mapping_nodes_header = ["curie:ID", ":LABEL", "prefix", "predicate", "confidence", "hasPrimary:boolean"] + mapping_nodes_header = [ + "curie:ID", + ":LABEL", + "prefix", + "predicate", + "confidence", + "primary:boolean", + "secondary:boolean", + "tertiary:boolean", + ] evidence_nodes_path = directory.joinpath("evidence_nodes.tsv") evidences = {} @@ -468,38 +489,46 @@ def write_neo4j( ] edges_path = directory.joinpath("edges.tsv") - edges: list[tuple[str, str, str, str | float]] = [] + edges: list[tuple[str, str, str, str | float, str, str, str, str]] = [] edges_header = [ ":START_ID", ":TYPE", ":END_ID", "confidence:float", - "hasPrimary:boolean", + "primary:boolean", + "secondary:boolean", + "tertiary:boolean", + "mapping_sets:string[]", ] for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): + mapping: Mapping concepts.add(mapping.s) concepts.add(mapping.o) + edges.append( ( mapping.s.curie, mapping.p.curie, mapping.o.curie, - round(c, 4) if (c := mapping.confidence) is not None else "", - _neo4j_bool(mapping.has_primary_evidence), + round(c, CONFIDENCE_PRECISION) if (c := mapping.confidence) is not None else "", + _neo4j_bool(mapping.has_primary), + _neo4j_bool(mapping.has_secondary), + _neo4j_bool(mapping.has_tertiary), + "|".join(sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set})), ) ) - edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "")) - edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "")) + edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "", "", "", "", "")) + edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "", "", "", "", "")) for evidence in mapping.evidence: - edges.append((mapping.curie, "hasEvidence", evidence.curie, "")) + edges.append((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie, "", "", "", "", "")) evidences[evidence.key()] = evidence if evidence.mapping_set: mapping_sets[evidence.mapping_set.name] = evidence.mapping_set - edges.append((evidence.curie, "fromSet", evidence.mapping_set.curie, "")) + edges.append((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie, "", "", "", "", "")) elif isinstance(evidence, ReasonedEvidence): for mmm in evidence.mappings: - edges.append((evidence.curie, "derivedFromMapping", mmm.curie, "")) + edges.append((evidence.curie, DERIVED_PREDICATE, mmm.curie, "", "", "", "", "")) elif isinstance(evidence, SimpleEvidence): pass else: @@ -508,7 +537,7 @@ def write_neo4j( # Add authorship information for the evidence, if available if evidence.author: concepts.add(evidence.author) - edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "")) + edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "", "", "", "", "")) _write_tsv( concept_nodes_path, @@ -518,7 +547,7 @@ def write_neo4j( concept.curie, "concept", concept.prefix, - pyobo.get_name_by_curie(concept.curie) or "" if add_labels else "", + _get_name_by_curie(concept.curie) or "" if add_labels else "", _neo4j_bool(equivalence_classes.get(concept, False)), ) for concept in sorted(concepts, key=lambda n: n.curie) @@ -533,8 +562,10 @@ def write_neo4j( "mapping", "semra.mapping", mapping.p.curie, - mapping.confidence and round(mapping.confidence, 4), - _neo4j_bool(mapping.has_primary_evidence), + mapping.confidence and round(mapping.confidence, CONFIDENCE_PRECISION), + _neo4j_bool(mapping.has_primary), + _neo4j_bool(mapping.has_secondary), + _neo4j_bool(mapping.has_tertiary), ) for mapping in sorted(mappings, key=lambda n: n.curie) ), diff --git a/src/semra/struct.py b/src/semra/struct.py index 8231492..d119db2 100644 --- a/src/semra/struct.py +++ b/src/semra/struct.py @@ -226,9 +226,25 @@ def curie(self) -> str: return self.get_reference().curie @property - def has_primary_evidence(self) -> bool: + def has_primary(self) -> bool: """Get if there is a primary evidence associated with this mapping.""" - return any(isinstance(evidence, SimpleEvidence) for evidence in self.evidence) + return any( + isinstance(evidence, SimpleEvidence) and evidence.mapping_set.name == self.s.prefix + for evidence in self.evidence + ) + + @property + def has_secondary(self) -> bool: + """Get if there is a secondary evidence associated with this mapping.""" + return any( + isinstance(evidence, SimpleEvidence) and evidence.mapping_set.name != self.s.prefix + for evidence in self.evidence + ) + + @property + def has_tertiary(self) -> bool: + """Get if there are any tertiary (i.e., reasoned) evidences for this mapping.""" + return any(not isinstance(evidence, SimpleEvidence) for evidence in self.evidence) def line(*references: Reference) -> list[Mapping]: diff --git a/src/semra/templates/concept.html b/src/semra/templates/concept.html index 3f57273..55e341b 100644 --- a/src/semra/templates/concept.html +++ b/src/semra/templates/concept.html @@ -15,6 +15,8 @@ + + {% endblock %} {% block scripts %} @@ -52,7 +54,7 @@ "curve-style": "bezier", 'text-background-color': 'yellow', 'text-background-opacity': 0.4, - 'width': '4px', + 'width': '2px', 'target-arrow-shape': 'triangle', 'control-point-step-size': '140px', autorotate: true @@ -60,16 +62,11 @@ } ], layout: { - name: 'cose', - animate: true, - padding: 100, + name: 'cola', + nodeSpacing: function( node ){ return 75; }, + // nodeDimensionsIncludeLabels: true } }); - - var layout = cy.layout({ name: 'cose' }); - layout.run(); - -layout.run(); }); {% endblock %} diff --git a/src/semra/templates/home.html b/src/semra/templates/home.html index 9b8c8bf..8fc96dc 100644 --- a/src/semra/templates/home.html +++ b/src/semra/templates/home.html @@ -14,7 +14,7 @@

{{ count }}{{ suffix }}

{% endmacro %} -{% macro do_table(counter, label, is_concept=False) %} +{% macro do_table(counter, label, is_concept=False, has_names=False) %}
{{ label }} Summary
@@ -22,6 +22,7 @@
{{ label }} Summary
{{ label }} + {% if has_names %}Name{% endif %} Count @@ -30,11 +31,17 @@
{{ label }} Summary
{%- if is_concept -%} + {% if has_names %} + {{ key[0] }} + {% else %} + FAILURE! {{ key }} + {% endif %} {%- else -%} - {{ key }} + {% if has_names %}{{ key[0] }}{% else %}{{ key }}{% endif %} {%- endif -%} + {% if has_names %}{{ key[1] }}{% endif %} {{ "{:,}".format(count) }} {% endfor %} @@ -53,8 +60,8 @@
Semantic Reasoning Assembler
- {{ count_column(node_counter, "References", "dna") }} {{ count_column(node_counter, "Concepts", "dna") }} + {{ count_column(node_counter, "Equivalence Classes", "dna") }} {{ count_column(node_counter, "Mappings", "arrows-alt") }} {{ count_column(node_counter, "Evidences", "book") }} {{ count_column(node_counter, "Mapping Sets", "puzzle-piece") }} @@ -94,9 +101,9 @@
Mapping Sets
{{ do_table(evidence_type_counter, "Evidence Type") }} {{ do_table(predicate_counter, "Predicate") }} {{ do_table(justification_counter, "Mapping Justification") }} - {{ do_table(prefix_counter, "Prefix") }} - {{ do_table(author_counter, "Author") }} - {{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True) }} + {{ do_table(prefix_counter, "Prefix", has_names=True) }} + {{ do_table(author_counter, "Author", has_names=True) }} + {{ do_table(high_matches_counter, "Potential Data Issues", is_concept=True, has_names=True) }}
Example Mappings