From f8808c8a1a037bfa8dae104746f5cd548afc336d Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Fri, 12 Jul 2024 09:07:51 -0400 Subject: [PATCH 01/21] Initial implementation of resource adding endpoint --- mira/dkg/api.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index c109beb1..36709a43 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -13,6 +13,7 @@ from mira.dkg.client import AskemEntity, Entity, Relation from mira.dkg.utils import DKG_REFINER_RELS +from mira.dkg.construct import process_resource __all__ = [ "api_blueprint", @@ -360,6 +361,31 @@ def add_relations( request.app.state.client.add_relation(relation) + @api_blueprint.post( + "/add_resources", + response_model=None, + tags=["relations"], + ) + def add_resources( + request: Request, + resource_list: List[str] + ): + for resource in resource_list: + # nodes and edges will be a list of dicts + nodes, edges = process_resource(resource) + + # node_info and edge_info are dictionaries that will be + # unpacked when creating instances of entities and relations + entities = [Entity(**node_info) for node_info in nodes] + relations = [Relation(**edge_info) for edge_info in edges] + + for entity in entities: + request.app.state.client.add_node(entity) + for relation in relations: + request.app.state.client.add_relation(relation) + + + class IsOntChildResult(BaseModel): """Result of a query to /is_ontological_child""" From 88bc327d04a23a77538a670cbf74e02d1d9e2fce Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Fri, 12 Jul 2024 17:54:48 -0400 Subject: [PATCH 02/21] Fix validation error for starting uvicorn instance and add retrieving probonto resource to add_resource endpoint, fix errors for adding synonyms, xrefs, properties info for nodes --- mira/dkg/api.py | 10 ++++-- mira/dkg/client.py | 41 +++++++++++++++------- mira/dkg/construct.py | 82 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 16 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 36709a43..84fa0f74 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -2,7 +2,7 @@ import itertools as itt import os -from typing import Any, List, Mapping, Optional, Union, Dict +from typing import Any, List, Mapping, Optional, Union import pydantic from fastapi import APIRouter, Body, HTTPException, Path, Query, Request @@ -368,12 +368,16 @@ def add_relations( ) def add_resources( request: Request, - resource_list: List[str] + resource_list: List[str] = Body( + ..., + description="A of resources to add to the DKG", + title="Resource Prefixes", + example=["probonto"], + ) ): for resource in resource_list: # nodes and edges will be a list of dicts nodes, edges = process_resource(resource) - # node_info and edge_info are dictionaries that will be # unpacked when creating instances of entities and relations entities = [Entity(**node_info) for node_info in nodes] diff --git a/mira/dkg/client.py b/mira/dkg/client.py index 7b9e2d16..5288fe76 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -3,6 +3,7 @@ import itertools as itt import logging import os +import json from collections import Counter, defaultdict from difflib import SequenceMatcher from functools import lru_cache @@ -352,20 +353,34 @@ def add_node(self, entity): alts = entity.alts xrefs = entity.xrefs labels = entity.labels + properties = entity.properties create_source_node_query = ( - f"MERGE (n {{curie: '{curie}', " - f"name: '{name}', " - f"type: '{type}', " - f"obsolete: {obsolete}, " - f"description: 
'{description}', " - f"synonyms: {synonyms}, " - f"alts: {alts}, " - f"xrefs: {xrefs}, " - f"labels: {labels} }} )" + "MERGE (n {curie: $curie, " + "name: $name, " + "type: $type, " + "obsolete: $obsolete, " + "description: $description, " + "synonyms: $synonyms, " + "alts: $alts, " + "xrefs: $xrefs, " + "labels: $labels, " + "properties: $properties})" ) - - self.create_tx(create_source_node_query) + query_parameters = { + "curie": curie, + "name": name, + "type": type, + "obsolete": obsolete, + "description": description, + "synonyms": json.dumps([synonym.dict() for synonym in synonyms]), + "alts": alts, + "xrefs": json.dumps([xref.dict() for xref in xrefs]), + "labels": labels, + "properties": json.dumps(properties) + } + + self.create_tx(create_source_node_query, **query_parameters) def add_relation(self, relation): """Add a relation to the DKG @@ -521,7 +536,9 @@ def get_grounder_terms(self, prefix: str) -> List["gilda.term.Term"]: def get_lexical(self) -> List[Entity]: """Get Lexical information for all entities.""" - query = f"MATCH (n) WHERE NOT n.obsolete and EXISTS(n.name) RETURN n" + query = (f"MATCH (n) WHERE NOT n.obsolete and EXISTS(n.name)" + f" and EXISTS(n.type) " + f"RETURN n") return [Entity.from_data(n) for n, in self.query_tx(query) or []] def get_grounder(self, prefix: Union[str, List[str]]) -> "gilda.grounder.Grounder": diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index c1b0b046..71c995ae 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -33,6 +33,7 @@ import biomappings import bioontologies import click +import pydantic.error_wrappers import pyobo import pystow from bioontologies import obograph @@ -54,6 +55,7 @@ from mira.dkg.physical_constants import get_physical_constant_terms from mira.dkg.constants import EDGE_HEADER, NODE_HEADER from mira.dkg.utils import PREFIXES +from mira.dkg.client import Synonym, Xref MODULE = pystow.module("mira") DEMO_MODULE = MODULE.module("demo", "import") @@ -199,6 +201,75 @@ class NodeInfo(NamedTuple): synonym_types: str +def get_probonto_data(): + probonto_edges = [] + nodes = [] + for term in tqdm( + get_probonto_terms(), unit="term", desc="Loading probonto" + ): + curie, name, parameters = ( + term["curie"], + term["name"], + term["parameters"], + ) + nodes.append( + { + "id": curie, + "name": name, + "type": "class", + "description": "", + "obsolete": False, + "xrefs": [Xref(id=eq.get("curie", ""), type=eq.get("name", "")) + for eq in term.get("equivalent", [])] + } + ) + + for parameter in term.get("parameters", []): + parameter_curie, parameter_name = ( + parameter["curie"], + parameter["name"], + ) + synonyms = [] + synonym_types = [] + parameter_symbol = parameter.get("symbol") + if parameter_symbol: + synonyms.append(parameter_symbol) + synonym_types.append("referenced_by_latex") + parameter_short = parameter.get("short_name") + if parameter_short: + synonyms.append(parameter_short) + synonym_types.append("oboInOwl:hasExactSynonym") + synonyms_list = [Synonym(value=value, type=type) for value, type in + zip(synonyms, synonym_types)] + nodes.append( + { + "id": parameter_curie, + "name": parameter_name, + "type": "class", + "description": "", + "obsolete": False, + "synonyms": synonyms_list + } + ) + probonto_edges.append( + { + "source_curie": curie, + "target_curie": parameter_curie, + "type": "has_parameter", + "pred": "probonto:c0000062", + "source": "probonto", + "graph": "https://raw.githubusercontent.com/probonto/ontologymaster/probonto4ols.owl", + "version": "2.5", + } + ) + return 
nodes, probonto_edges + + +def process_resource(resource_prefix: str): + if resource_prefix == "probonto": + return get_probonto_data() + + @click.command() @click.option( "--add-xref-edges", @@ -254,7 +325,14 @@ def construct( edge_names = {} for edge_prefix in DEFAULT_VOCABS: click.secho(f"Caching {manager.get_name(edge_prefix)}", fg="green", bold=True) - parse_results = bioontologies.get_obograph_by_prefix(edge_prefix) + try: + parse_results = bioontologies.get_obograph_by_prefix(edge_prefix) + except pydantic.error_wrappers.ValidationError: + print(f"VALIDATE NODE GRAPH ERROR {edge_prefix}") + continue + if not parse_results.graph_document: + print(f"EMPTY GRAPH {edge_prefix}") + continue for edge_graph in parse_results.graph_document.graphs: edge_graph = edge_graph.standardize() for edge_node in edge_graph.nodes: @@ -931,6 +1009,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) writer.writerow(EDGE_HEADER) for prefix, edge_path in tqdm(sorted(use_case_paths.EDGES_PATHS.items()), desc="cat edges"): + if not edge_path.is_file(): + continue with edge_path.open() as edge_file: reader = csv.reader(edge_file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) _header = next(reader) From 42e04785d9f87527111319ae0337cad21b08bf1a Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Fri, 12 Jul 2024 18:04:22 -0400 Subject: [PATCH 03/21] Add properties to probonto nodes --- mira/dkg/construct.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 71c995ae..f13ad687 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -212,6 +212,11 @@ def get_probonto_data(): term["name"], term["parameters"], ) + properties = { + "has_parameter": [parameter["name"].replace("\n", " ") for parameter + in + parameters] + } nodes.append( { "id": curie, @@ -220,10 +225,11 @@ def get_probonto_data(): "description": "", "obsolete": False, "xrefs": [Xref(id=eq.get("curie", ""), type=eq.get("name", "")) - for eq in term.get("equivalent", [])] + for eq in term.get("equivalent", [])], + "properties": properties + } ) - for parameter in term.get("parameters", []): parameter_curie, parameter_name = ( parameter["curie"], From ff74798690e12fbec6dd9e4387f7633509698dfc Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Mon, 15 Jul 2024 15:03:15 -0400 Subject: [PATCH 04/21] Process epi use case nodes and edges --- mira/dkg/api.py | 2 +- mira/dkg/construct.py | 118 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 111 insertions(+), 9 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 84fa0f74..d811931c 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -377,7 +377,7 @@ def add_resources( ): for resource in resource_list: # nodes and edges will be a list of dicts - nodes, edges = process_resource(resource) + nodes, edges = process_resource(resource.lower()) # node_info and edge_info are dictionaries that will be # unpacked when creating instances of entities and relations entities = [Entity(**node_info) for node_info in nodes] diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index f13ad687..e200ad68 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -56,6 +56,7 @@ from mira.dkg.constants import EDGE_HEADER, NODE_HEADER from mira.dkg.utils import PREFIXES from mira.dkg.client import Synonym, Xref +from .resources.geonames import get_geonames_terms MODULE = pystow.module("mira") DEMO_MODULE = MODULE.module("demo", "import") 
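A minimal sketch of how the pieces introduced in the patches above fit together when used directly rather than through the HTTP endpoint. It assumes a running Neo4j instance that Neo4jClient can reach with its default configuration; process_resource, Entity, Relation, add_node, and add_relation are the helpers added in this series.

    from mira.dkg.client import Entity, Relation, Neo4jClient
    from mira.dkg.construct import process_resource

    client = Neo4jClient()
    # nodes and edges are plain lists of dicts, exactly as consumed by the endpoint
    nodes, edges = process_resource("probonto")
    for node_info in nodes:
        client.add_node(Entity(**node_info))
    for edge_info in edges:
        client.add_relation(Relation(**edge_info))
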
@@ -201,9 +202,8 @@ class NodeInfo(NamedTuple): synonym_types: str -def get_probonto_data(): - probonto_edges = [] - nodes = [] +def extract_probonto_nodes_edges(): + probonto_nodes, probonto_edges = [], [] for term in tqdm( get_probonto_terms(), unit="term", desc="Loading probonto" ): @@ -217,7 +217,7 @@ def get_probonto_data(): in parameters] } - nodes.append( + probonto_nodes.append( { "id": curie, "name": name, @@ -247,7 +247,7 @@ def get_probonto_data(): synonym_types.append("oboInOwl:hasExactSynonym") synonyms_list = [Synonym(value=value, type=type) for value, type in zip(synonyms, synonym_types)] - nodes.append( + probonto_nodes.append( { "id": parameter_curie, "name": parameter_name, @@ -268,12 +268,115 @@ def get_probonto_data(): "version": "2.5", } ) - return nodes, probonto_edges + return probonto_nodes, probonto_edges + + +def extract_geonames_nodes_edges(): + geonames_nodes, geonames_edges = [], [] + for term in tqdm(get_geonames_terms(), unit="term", desc="Geonames"): + geonames_nodes.append( + { + "id": term.curie, + "name": term.name, + "type": "individual", + "description": term.definition, + "obsolete": False if not term.is_obsolete else True, + "synonyms": term.synonyms, + "alts": term.alt_ids, + "xrefs": term.xrefs, + "properties": term.properties, + } + ) + for parent in term.get_relationships(part_of): + geonames_edges.append( + ( + term.curie, + parent.curie, + "part_of", + part_of.curie.lower(), + "geonames", + "geonames", + "", + ) + ) + return geonames_nodes, geonames_edges + + +def extract_ncit_nodes_edges(): + ncit_nodes, ncit_edges = [], [] + for term in tqdm(get_ncit_subset(), unit="term", desc="NCIT"): + ncit_nodes.append( + { + "id": term.curie, + "name": term.name, + "type": "class", + "description": term.definition, + "obsolete": False if not term.is_obsolete else True, + "synonyms": term.synonyms, + "alts": term.alt_ids, + "xrefs": term.xrefs, + "properties": term.properties, + } + ) + for parent in term.get_relationships(part_of): + ncit_edges.append( + ( + term.curie, + parent.curie, + "part_of", + part_of.curie.lower(), + "ncit", + "ncit", + "", + ) + ) + + +def extract_ncbitaxon_nodes_edges(): + ncbitaxon_nodes, ncbitaxon_edges = [], [] + for term in tqdm(get_ncbitaxon(), unit="term", desc="NCBITaxon"): + ncbitaxon_nodes.append( + { + "id": term.curie, + "name": term.name, + "type": "class", + "description": term.definition, + "obsolete": False if not term.is_obsolete else True, + "synonyms": term.synonyms, + "alts": term.alt_ids, + "xrefs": term.xrefs, + "properties": term.properties, + } + ) + for parent in term.get_relationships(part_of): + ncbitaxon_edges.append( + ( + term.curie, + parent.curie, + "part_of", + part_of.curie.lower(), + "ncbitaxon", + "ncbitaxon", + "", + ) + ) def process_resource(resource_prefix: str): if resource_prefix == "probonto": - return get_probonto_data() + return extract_probonto_nodes_edges() + elif resource_prefix == "geonames": + return extract_geonames_nodes_edges() + elif resource_prefix == "ncit": + return extract_ncit_nodes_edges() + elif resource_prefix == "ncbitaxon": + return extract_ncbitaxon_nodes_edges() + elif resource_prefix == "eiffel": + pass + elif resource_prefix == "cso": + pass + elif resource_prefix == "wikidata": + pass @click.command() @@ -534,7 +637,6 @@ def construct( writer.writerow(EDGE_HEADER) writer.writerows(eiffel_edges) if use_case == "epi": - from .resources.geonames import get_geonames_terms geonames_edges = [] for term in tqdm(get_geonames_terms(), unit="term", desc="Geonames"): 
node_sources[term.curie].add("geonames") From 831c7d1d79321c2ff00dd7b88f70a8a912a34a0b Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Mon, 15 Jul 2024 16:13:48 -0400 Subject: [PATCH 05/21] Add climate nodes and edges, add kwarg to from_obo_path to extract cso data --- mira/dkg/construct.py | 91 +++++++++++++++++++++++++++++++++------ mira/dkg/resources/cso.py | 3 +- 2 files changed, 79 insertions(+), 15 deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index e200ad68..619eaa18 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -56,7 +56,10 @@ from mira.dkg.constants import EDGE_HEADER, NODE_HEADER from mira.dkg.utils import PREFIXES from mira.dkg.client import Synonym, Xref -from .resources.geonames import get_geonames_terms +from mira.dkg.resources.cso import get_cso_obo +from mira.dkg.resources.geonames import get_geonames_terms +from mira.dkg.resources.extract_eiffel_ontology import get_eiffel_ontology_terms +from mira.dkg.resources.uat import get_uat MODULE = pystow.module("mira") DEMO_MODULE = MODULE.module("demo", "import") @@ -204,9 +207,7 @@ class NodeInfo(NamedTuple): def extract_probonto_nodes_edges(): probonto_nodes, probonto_edges = [], [] - for term in tqdm( - get_probonto_terms(), unit="term", desc="Loading probonto" - ): + for term in tqdm(get_probonto_terms(), unit="term"): curie, name, parameters = ( term["curie"], term["name"], @@ -273,7 +274,7 @@ def extract_probonto_nodes_edges(): def extract_geonames_nodes_edges(): geonames_nodes, geonames_edges = [], [] - for term in tqdm(get_geonames_terms(), unit="term", desc="Geonames"): + for term in tqdm(get_geonames_terms(), unit="term"): geonames_nodes.append( { "id": term.curie, @@ -304,7 +305,7 @@ def extract_geonames_nodes_edges(): def extract_ncit_nodes_edges(): ncit_nodes, ncit_edges = [], [] - for term in tqdm(get_ncit_subset(), unit="term", desc="NCIT"): + for term in tqdm(get_ncit_subset(), unit="term"): ncit_nodes.append( { "id": term.curie, @@ -334,7 +335,7 @@ def extract_ncit_nodes_edges(): def extract_ncbitaxon_nodes_edges(): ncbitaxon_nodes, ncbitaxon_edges = [], [] - for term in tqdm(get_ncbitaxon(), unit="term", desc="NCBITaxon"): + for term in tqdm(get_ncbitaxon(), unit="term"): ncbitaxon_nodes.append( { "id": term.curie, @@ -362,6 +363,72 @@ def extract_ncbitaxon_nodes_edges(): ) +def extract_eiffel_nodes_edges(): + eiffel_nodes, eiffel_edges = [], [] + for term in tqdm(get_eiffel_ontology_terms(), unit="term"): + eiffel_nodes.append( + { + "id": term.curie, + "name": term.name, + "type": "class", + "description": term.definition, + "obsolete": False if not term.is_obsolete else True, + "synonyms": term.synonyms, + "alts": term.alt_ids, + "xrefs": term.xrefs, + "properties": term.properties, + } + ) + for typedef, object_references in term.relationships.items(): + for object_reference in object_references: + eiffel_edges.append( + ( + term.curie, + object_reference.curie, + typedef.name.replace(" ", "").lower(), + typedef.curie, + "eiffel", + "eiffel", + "", + ) + ) + return eiffel_nodes, eiffel_edges + + +def extract_cso_nodes_edges(): + cso_nodes, cso_edges = [], [] + for term in get_cso_obo().iter_terms(): + cso_nodes.append( + { + "id": term.curie, + "name": term.name, + "type": "class", + "description": term.definition, + "obsolete": False if not term.is_obsolete else True, + "synonyms": term.synonyms, + "alts": term.alt_ids, + "xrefs": term.xrefs, + "properties": term.properties, + } + ) + for parent in term.get_relationship(part_of): + cso_edges.append( + ( + 
term.curie, + parent.curie, + "part_of", + part_of.curie.lower(), + "cso", + "cso", + "", + ) + ) + + +def extract_wikidata_nodes_edges(): + pass + + def process_resource(resource_prefix: str): if resource_prefix == "probonto": return extract_probonto_nodes_edges() @@ -372,11 +439,11 @@ def process_resource(resource_prefix: str): elif resource_prefix == "ncbitaxon": return extract_ncbitaxon_nodes_edges() elif resource_prefix == "eiffel": - pass + return extract_eiffel_nodes_edges() elif resource_prefix == "cso": - pass + return extract_cso_nodes_edges() elif resource_prefix == "wikidata": - pass + return extract_wikidata_nodes_edges() @click.command() @@ -406,7 +473,6 @@ def main( use_case = config.use_case else: config = None - construct( use_case=use_case, config=config, @@ -606,13 +672,11 @@ def construct( writer.writerows(probonto_edges) if use_case == "climate": - from .resources.cso import get_cso_obo for term in get_cso_obo().iter_terms(): node_sources[term.curie].add("cso") nodes[term.curie] = get_node_info(term) - from .resources.extract_eiffel_ontology import get_eiffel_ontology_terms eiffel_edges = [] for term in tqdm(get_eiffel_ontology_terms(), unit="term", desc="Eiffel"): @@ -671,7 +735,6 @@ def construct( # TODO add edges to source file later, if important if use_case == "space": - from .resources.uat import get_uat uat_ontology = get_uat() uat_edges = [] diff --git a/mira/dkg/resources/cso.py b/mira/dkg/resources/cso.py index 88e8d1da..2cde3264 100644 --- a/mira/dkg/resources/cso.py +++ b/mira/dkg/resources/cso.py @@ -23,7 +23,8 @@ def get_cso_obo() -> Obo: ) download(url=URL, path=PATH) # use https://github.com/pyobo/pyobo/pull/159 - return from_obo_path(PATH, prefix="cso", default_prefix="cso", strict=False) + kwargs = {"default_prefix": "cso"} + return from_obo_path(PATH, prefix="cso", strict=False, **kwargs) if __name__ == "__main__": From 20abbb963ca189ba4eb3006c2b1afb7f05337d86 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Mon, 15 Jul 2024 17:05:27 -0400 Subject: [PATCH 06/21] Add wikidata nodes and edges, update add_resources endpoint example, use better variable names, revert kwarg addition to 'from_obo_path' call --- mira/dkg/api.py | 11 +++++---- mira/dkg/client.py | 1 - mira/dkg/construct.py | 50 +++++++++++++++++++++++++++++++++++++-- mira/dkg/resources/cso.py | 5 ++-- 4 files changed, 56 insertions(+), 11 deletions(-) diff --git a/mira/dkg/api.py b/mira/dkg/api.py index d811931c..6fa5638d 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -13,7 +13,7 @@ from mira.dkg.client import AskemEntity, Entity, Relation from mira.dkg.utils import DKG_REFINER_RELS -from mira.dkg.construct import process_resource +from mira.dkg.construct import add_resource_to_dkg __all__ = [ "api_blueprint", @@ -368,16 +368,17 @@ def add_relations( ) def add_resources( request: Request, - resource_list: List[str] = Body( + resource_prefix_list: List[str] = Body( ..., description="A of resources to add to the DKG", title="Resource Prefixes", - example=["probonto"], + example=["probonto", "wikidata", "eiffel", "geonames", "ncit", + "nbcbitaxon"], ) ): - for resource in resource_list: + for resource_prefix in resource_prefix_list: # nodes and edges will be a list of dicts - nodes, edges = process_resource(resource.lower()) + nodes, edges = add_resource_to_dkg(resource_prefix.lower()) # node_info and edge_info are dictionaries that will be # unpacked when creating instances of entities and relations entities = [Entity(**node_info) for node_info in nodes] diff --git a/mira/dkg/client.py 
b/mira/dkg/client.py index 5288fe76..f0929569 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -410,7 +410,6 @@ def add_relation(self, relation): self.create_tx(create_relation_query) - def create_single_property_node_index( self, index_name: str, diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 619eaa18..10212712 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -426,10 +426,55 @@ def extract_cso_nodes_edges(): def extract_wikidata_nodes_edges(): - pass + wikidata_nodes, wikidata_edges = [], [] + for wikidata_id, label, description, synonyms, xrefs in tqdm( + get_unit_terms(), unit="unit"): + synonyms_list = [Synonym(value=value, type="") for value in synonyms] + xrefs_list = [Xref(id=_id, type="oboinowl:hasDbXref") for _id in xrefs] + wikidata_nodes.append( + { + "id": f"wikidata:{wikidata_id}", + "name": label, + "type": "class", + "description": description, + "synonyms": synonyms_list, + "xrefs": xrefs_list, + "obsolete": False + } + ) + + for (wikidata_id, label, description, synonyms, xrefs, value, formula, + symbols) in tqdm(get_physical_constant_terms()): + synonym_types, synonym_values = [], [] + for syn in synonyms: + synonym_values.append(syn) + synonym_types.append("oboInOwl:hasExactSynonym") + for symbol in symbols: + synonym_values.append(symbol) + synonym_types.append("debio:0000031") + + synonyms_list = [Synonym(value=value, type=type) for value, type + in zip(synonym_values, synonym_types)] + xrefs_list = [Xref(id=_id, type="oboinowl:hasDbXref") for _id in xrefs] + if value: + properties = {"debio:0000042": [str(value)]} + else: + properties = {} + wikidata_nodes.append( + { + "id": f"wikidata:{wikidata_id}", + "name": label, + "obsolete": False, + "type": "class", + "description": description, + "synonyms": synonyms_list, + "xrefs": xrefs_list, + "properties": properties + } + ) -def process_resource(resource_prefix: str): +def add_resource_to_dkg(resource_prefix: str): if resource_prefix == "probonto": return extract_probonto_nodes_edges() elif resource_prefix == "geonames": @@ -443,6 +488,7 @@ def process_resource(resource_prefix: str): elif resource_prefix == "cso": return extract_cso_nodes_edges() elif resource_prefix == "wikidata": + # combine retrieval of wikidata constants and units return extract_wikidata_nodes_edges() diff --git a/mira/dkg/resources/cso.py b/mira/dkg/resources/cso.py index 2cde3264..aff995b6 100644 --- a/mira/dkg/resources/cso.py +++ b/mira/dkg/resources/cso.py @@ -23,10 +23,9 @@ def get_cso_obo() -> Obo: ) download(url=URL, path=PATH) # use https://github.com/pyobo/pyobo/pull/159 - kwargs = {"default_prefix": "cso"} - return from_obo_path(PATH, prefix="cso", strict=False, **kwargs) + return from_obo_path(PATH, prefix="cso", default_prefix="cso", strict=False) if __name__ == "__main__": for term in get_cso_obo(): - print(term) + print(term) \ No newline at end of file From 18ebfbfe8015a2ed45921a4286fcddbf323f9958 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Tue, 16 Jul 2024 13:40:43 -0400 Subject: [PATCH 07/21] Add return values for some extraction methods, refactor how synonyms and xrefs are processed for terms --- mira/dkg/client.py | 4 +- mira/dkg/construct.py | 148 ++++++++++++++++++++++---------------- mira/dkg/resources/cso.py | 2 +- 3 files changed, 89 insertions(+), 65 deletions(-) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index f0929569..30e5dbf3 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -535,9 +535,7 @@ def get_grounder_terms(self, prefix: str) -> 
List["gilda.term.Term"]: def get_lexical(self) -> List[Entity]: """Get Lexical information for all entities.""" - query = (f"MATCH (n) WHERE NOT n.obsolete and EXISTS(n.name)" - f" and EXISTS(n.type) " - f"RETURN n") + query = f"MATCH (n) WHERE NOT n.obsolete and EXISTS(n.name) RETURN n" return [Entity.from_data(n) for n, in self.query_tx(query) or []] def get_grounder(self, prefix: Union[str, List[str]]) -> "gilda.grounder.Grounder": diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 10212712..e660d786 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -280,25 +280,29 @@ def extract_geonames_nodes_edges(): "id": term.curie, "name": term.name, "type": "individual", - "description": term.definition, + "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": term.synonyms, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], "alts": term.alt_ids, - "xrefs": term.xrefs, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], "properties": term.properties, } ) for parent in term.get_relationships(part_of): geonames_edges.append( - ( - term.curie, - parent.curie, - "part_of", - part_of.curie.lower(), - "geonames", - "geonames", - "", - ) + { + "source_curie": term.curie, + "target_curie": parent.curie, + "type": "part_of", + "pred": part_of.curie.lower(), + "source": "geonames", + "graph": "geonames", + "version": "", + } ) return geonames_nodes, geonames_edges @@ -311,26 +315,31 @@ def extract_ncit_nodes_edges(): "id": term.curie, "name": term.name, "type": "class", - "description": term.definition, + "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": term.synonyms, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], "alts": term.alt_ids, - "xrefs": term.xrefs, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], "properties": term.properties, } ) for parent in term.get_relationships(part_of): ncit_edges.append( - ( - term.curie, - parent.curie, - "part_of", - part_of.curie.lower(), - "ncit", - "ncit", - "", - ) + { + "source_curie": term.curie, + "target_curie": parent.curie, + "type": "part_of", + "pred": part_of.curie.lower(), + "source": "ncit", + "graph": "ncit", + "version": "", + } ) + return ncit_nodes, ncit_edges def extract_ncbitaxon_nodes_edges(): @@ -341,26 +350,31 @@ def extract_ncbitaxon_nodes_edges(): "id": term.curie, "name": term.name, "type": "class", - "description": term.definition, + "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": term.synonyms, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], "alts": term.alt_ids, - "xrefs": term.xrefs, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], "properties": term.properties, } ) for parent in term.get_relationships(part_of): ncbitaxon_edges.append( - ( - term.curie, - parent.curie, - "part_of", - part_of.curie.lower(), - "ncbitaxon", - "ncbitaxon", - "", - ) + { + "source_curie": term.curie, + "target_curie": parent.curie, + "type": "part_of", + "pred": part_of.curie.lower(), + "source": 
"ncbitaxon", + "graph": "ncbitaxon", + "version": "", + } ) + return ncbitaxon_nodes, ncbitaxon_edges def extract_eiffel_nodes_edges(): @@ -371,26 +385,30 @@ def extract_eiffel_nodes_edges(): "id": term.curie, "name": term.name, "type": "class", - "description": term.definition, + "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": term.synonyms, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], "alts": term.alt_ids, - "xrefs": term.xrefs, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], "properties": term.properties, } ) for typedef, object_references in term.relationships.items(): for object_reference in object_references: eiffel_edges.append( - ( - term.curie, - object_reference.curie, - typedef.name.replace(" ", "").lower(), - typedef.curie, - "eiffel", - "eiffel", - "", - ) + { + "source_curie": term.curie, + "target_curie": object_reference.curie, + "type": typedef.name.replace(" ", "").lower(), + "pred": typedef.curie, + "source": "eiffel", + "graph": "eiffel", + "version": "", + } ) return eiffel_nodes, eiffel_edges @@ -403,26 +421,31 @@ def extract_cso_nodes_edges(): "id": term.curie, "name": term.name, "type": "class", - "description": term.definition, + "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": term.synonyms, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], "alts": term.alt_ids, - "xrefs": term.xrefs, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], "properties": term.properties, } ) for parent in term.get_relationship(part_of): cso_edges.append( - ( - term.curie, - parent.curie, - "part_of", - part_of.curie.lower(), - "cso", - "cso", - "", - ) + { + "source_curie": term.curie, + "target_curie": parent.curie, + "type": "part_of", + "pred": part_of.curie.lower(), + "source": "cso", + "graph": "cso", + "version": "", + } ) + return cso_nodes, cso_edges def extract_wikidata_nodes_edges(): @@ -472,6 +495,7 @@ def extract_wikidata_nodes_edges(): "properties": properties } ) + return wikidata_nodes, wikidata_edges def add_resource_to_dkg(resource_prefix: str): @@ -490,6 +514,9 @@ def add_resource_to_dkg(resource_prefix: str): elif resource_prefix == "wikidata": # combine retrieval of wikidata constants and units return extract_wikidata_nodes_edges() + else: + # handle resource names that we don't process + return [], [] @click.command() @@ -723,7 +750,6 @@ def construct( node_sources[term.curie].add("cso") nodes[term.curie] = get_node_info(term) - eiffel_edges = [] for term in tqdm(get_eiffel_ontology_terms(), unit="term", desc="Eiffel"): node_sources[term.curie].add("eiffel") diff --git a/mira/dkg/resources/cso.py b/mira/dkg/resources/cso.py index aff995b6..88e8d1da 100644 --- a/mira/dkg/resources/cso.py +++ b/mira/dkg/resources/cso.py @@ -28,4 +28,4 @@ def get_cso_obo() -> Obo: if __name__ == "__main__": for term in get_cso_obo(): - print(term) \ No newline at end of file + print(term) From a7aef3c328f320d7bdbf92374c8eefb02f14c3fe Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Tue, 16 Jul 2024 13:43:20 -0400 Subject: [PATCH 08/21] Revert error handling in construct.py --- mira/dkg/construct.py | 11 +---------- 1 file changed, 1 insertion(+), 10 
deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index e660d786..77a72408 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -573,14 +573,7 @@ def construct( edge_names = {} for edge_prefix in DEFAULT_VOCABS: click.secho(f"Caching {manager.get_name(edge_prefix)}", fg="green", bold=True) - try: - parse_results = bioontologies.get_obograph_by_prefix(edge_prefix) - except pydantic.error_wrappers.ValidationError: - print(f"VALIDATE NODE GRAPH ERROR {edge_prefix}") - continue - if not parse_results.graph_document: - print(f"EMPTY GRAPH {edge_prefix}") - continue + parse_results = bioontologies.get_obograph_by_prefix(edge_prefix) for edge_graph in parse_results.graph_document.graphs: edge_graph = edge_graph.standardize() for edge_node in edge_graph.nodes: @@ -1252,8 +1245,6 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: writer = csv.writer(file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) writer.writerow(EDGE_HEADER) for prefix, edge_path in tqdm(sorted(use_case_paths.EDGES_PATHS.items()), desc="cat edges"): - if not edge_path.is_file(): - continue with edge_path.open() as edge_file: reader = csv.reader(edge_file, delimiter="\t", quoting=csv.QUOTE_MINIMAL) _header = next(reader) From a709ae707754df5d391b7538e9591b7404be2f03 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Tue, 16 Jul 2024 15:49:19 -0400 Subject: [PATCH 09/21] Add correct node properties for a node to be added to the dkg --- mira/dkg/client.py | 64 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index 30e5dbf3..cdb09001 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -171,6 +171,7 @@ def from_data(cls, data): ------- A MIRA entity """ + if isinstance(data, neo4j.graph.Node): data = dict(data.items()) properties = defaultdict(list) @@ -186,6 +187,8 @@ def from_data(cls, data): ): synonyms.append(Synonym(value=value, type=type)) xrefs = [] + + for curie, type in zip( data.pop("xrefs", []), data.pop("xref_types", []), @@ -344,40 +347,67 @@ def add_node(self, entity): entity: The node object that will be added to the DKG """ - curie = entity.id + xrefs, xref_types = [], [] + synonyms, synonym_types = [], [] + property_predicates, property_values = [], [] + for xref in entity.xrefs: + xrefs.append(xref.id) + xref_types.append(xref.type) + for synonym in entity.synonyms: + synonyms.append(synonym.value) + synonym_types.append(synonym.value) + for property_predicate, property_value_list in entity.properties.items(): + property_predicates.append(property_predicate) + property_values.extend(property_value_list) + + _id = entity.id name = entity.name type = entity.type obsolete = entity.obsolete description = entity.description - synonyms = entity.synonyms alts = entity.alts - xrefs = entity.xrefs labels = entity.labels - properties = entity.properties create_source_node_query = ( - "MERGE (n {curie: $curie, " - "name: $name, " + "MERGE (n {id: $id, " "type: $type, " - "obsolete: $obsolete, " - "description: $description, " - "synonyms: $synonyms, " - "alts: $alts, " - "xrefs: $xrefs, " - "labels: $labels, " - "properties: $properties})" + "obsolete: $obsolete" ) + + if name: + create_source_node_query += ", name: $name" + if description: + create_source_node_query += ", description: $description" + if alts: + create_source_node_query += ", alts: $alts" + if labels: + create_source_node_query += ", labels: $labels" + if xrefs: + create_source_node_query += ", xrefs: $xrefs" + 
create_source_node_query += ", xref_types: $xref_types" + if synonyms: + create_source_node_query += ", synonyms: $synonyms" + create_source_node_query += ", synonym_types: $synonym_types" + if property_predicates: + create_source_node_query += ", property_predicates: $property_predicates" + create_source_node_query += ", property_values: $property_values" + + create_source_node_query += "})" + query_parameters = { - "curie": curie, + "id": _id, "name": name, "type": type, "obsolete": obsolete, "description": description, - "synonyms": json.dumps([synonym.dict() for synonym in synonyms]), + "synonyms": synonyms, + "synonym_types": synonym_types, "alts": alts, - "xrefs": json.dumps([xref.dict() for xref in xrefs]), + "xrefs": xrefs, + "xref_types": xref_types, "labels": labels, - "properties": json.dumps(properties) + "property_predicates": property_predicates, + "property_values": property_values } self.create_tx(create_source_node_query, **query_parameters) From 87782d8db40152f7dccfe4ac99c0f5180e219acb Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Tue, 16 Jul 2024 16:20:55 -0400 Subject: [PATCH 10/21] Aggregate retrieval of pyobo resources into one method --- mira/dkg/construct.py | 247 +++++++++++------------------------------- 1 file changed, 65 insertions(+), 182 deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 77a72408..aee21845 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -205,6 +205,60 @@ class NodeInfo(NamedTuple): synonym_types: str +def extract_nodes_edges_from_pyobo_terms(term_getter, resource_prefix): + nodes, edges = [], [] + if resource_prefix in {"geonames"}: + entity_type = "individual" + elif resource_prefix in {"ncit", "ncbitaxon", "eiffel", "cso"}: + entity_type = "class" + for term in tqdm(term_getter(), unit="term"): + nodes.append( + { + "id": term.curie, + "name": term.name, + "type": entity_type, + "description": term.definition if term.definition else "", + "obsolete": False if not term.is_obsolete else True, + "synonyms": [Synonym(value=syn._fp(), + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.prefix}") + for syn in term.synonyms], + "alts": term.alt_ids, + "xrefs": [Xref(value=value, type=type) for value, type in + zip(term.xrefs, term.xref_types)], + "properties": dict(term.properties), + } + ) + if resource_prefix != "eiffel": + for parent in term.get_relationships(part_of): + edges.append( + { + "source_curie": term.curie, + "target_curie": parent.curie, + "type": "part_of", + "pred": part_of.curie.lower(), + "source": resource_prefix, + "graph": resource_prefix, + "version": "", + } + ) + else: + for typedef, object_references in term.relationships.items(): + for object_reference in object_references: + edges.append( + { + "source_curie": term.curie, + "target_curie": object_reference.curie, + "type": typedef.name.replace(" ", "").lower(), + "pred": typedef.curie, + "source": "eiffel", + "graph": "eiffel", + "version": "", + } + ) + return nodes, edges + + def extract_probonto_nodes_edges(): probonto_nodes, probonto_edges = [], [] for term in tqdm(get_probonto_terms(), unit="term"): @@ -272,186 +326,10 @@ def extract_probonto_nodes_edges(): return probonto_nodes, probonto_edges -def extract_geonames_nodes_edges(): - geonames_nodes, geonames_edges = [], [] - for term in tqdm(get_geonames_terms(), unit="term"): - geonames_nodes.append( - { - "id": term.curie, - "name": term.name, - "type": "individual", - "description": term.definition if term.definition else "", - "obsolete": False if not 
term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": term.properties, - } - ) - for parent in term.get_relationships(part_of): - geonames_edges.append( - { - "source_curie": term.curie, - "target_curie": parent.curie, - "type": "part_of", - "pred": part_of.curie.lower(), - "source": "geonames", - "graph": "geonames", - "version": "", - } - ) - return geonames_nodes, geonames_edges - - -def extract_ncit_nodes_edges(): - ncit_nodes, ncit_edges = [], [] - for term in tqdm(get_ncit_subset(), unit="term"): - ncit_nodes.append( - { - "id": term.curie, - "name": term.name, - "type": "class", - "description": term.definition if term.definition else "", - "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": term.properties, - } - ) - for parent in term.get_relationships(part_of): - ncit_edges.append( - { - "source_curie": term.curie, - "target_curie": parent.curie, - "type": "part_of", - "pred": part_of.curie.lower(), - "source": "ncit", - "graph": "ncit", - "version": "", - } - ) - return ncit_nodes, ncit_edges - - -def extract_ncbitaxon_nodes_edges(): - ncbitaxon_nodes, ncbitaxon_edges = [], [] - for term in tqdm(get_ncbitaxon(), unit="term"): - ncbitaxon_nodes.append( - { - "id": term.curie, - "name": term.name, - "type": "class", - "description": term.definition if term.definition else "", - "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": term.properties, - } - ) - for parent in term.get_relationships(part_of): - ncbitaxon_edges.append( - { - "source_curie": term.curie, - "target_curie": parent.curie, - "type": "part_of", - "pred": part_of.curie.lower(), - "source": "ncbitaxon", - "graph": "ncbitaxon", - "version": "", - } - ) - return ncbitaxon_nodes, ncbitaxon_edges - - -def extract_eiffel_nodes_edges(): - eiffel_nodes, eiffel_edges = [], [] - for term in tqdm(get_eiffel_ontology_terms(), unit="term"): - eiffel_nodes.append( - { - "id": term.curie, - "name": term.name, - "type": "class", - "description": term.definition if term.definition else "", - "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": term.properties, - } - ) - for typedef, object_references in term.relationships.items(): - for object_reference in object_references: - eiffel_edges.append( - { - "source_curie": term.curie, - "target_curie": object_reference.curie, - "type": typedef.name.replace(" ", "").lower(), - "pred": typedef.curie, - "source": "eiffel", - "graph": "eiffel", - "version": "", - } - ) - return eiffel_nodes, eiffel_edges - - -def 
extract_cso_nodes_edges(): - cso_nodes, cso_edges = [], [] - for term in get_cso_obo().iter_terms(): - cso_nodes.append( - { - "id": term.curie, - "name": term.name, - "type": "class", - "description": term.definition if term.definition else "", - "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": term.properties, - } - ) - for parent in term.get_relationship(part_of): - cso_edges.append( - { - "source_curie": term.curie, - "target_curie": parent.curie, - "type": "part_of", - "pred": part_of.curie.lower(), - "source": "cso", - "graph": "cso", - "version": "", - } - ) - return cso_nodes, cso_edges - - def extract_wikidata_nodes_edges(): wikidata_nodes, wikidata_edges = [], [] for wikidata_id, label, description, synonyms, xrefs in tqdm( - get_unit_terms(), unit="unit"): + get_unit_terms(), unit="unit"): synonyms_list = [Synonym(value=value, type="") for value in synonyms] xrefs_list = [Xref(id=_id, type="oboinowl:hasDbXref") for _id in xrefs] wikidata_nodes.append( @@ -502,15 +380,20 @@ def add_resource_to_dkg(resource_prefix: str): if resource_prefix == "probonto": return extract_probonto_nodes_edges() elif resource_prefix == "geonames": - return extract_geonames_nodes_edges() + return extract_nodes_edges_from_pyobo_terms(get_geonames_terms, + "geonames") elif resource_prefix == "ncit": - return extract_ncit_nodes_edges() + return extract_nodes_edges_from_pyobo_terms(get_ncit_subset, + "ncit") elif resource_prefix == "ncbitaxon": - return extract_ncbitaxon_nodes_edges() + return extract_nodes_edges_from_pyobo_terms(get_ncbitaxon, + "ncbitaxon") elif resource_prefix == "eiffel": - return extract_eiffel_nodes_edges() + return extract_nodes_edges_from_pyobo_terms( + get_eiffel_ontology_terms, "eiffel") elif resource_prefix == "cso": - return extract_cso_nodes_edges() + return extract_nodes_edges_from_pyobo_terms(get_cso_obo(), + "cso") elif resource_prefix == "wikidata": # combine retrieval of wikidata constants and units return extract_wikidata_nodes_edges() From 7525bd224e775358c8fa00e7454fa0811b3348f2 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Tue, 16 Jul 2024 16:33:22 -0400 Subject: [PATCH 11/21] Remove unused imports --- mira/dkg/client.py | 1 - mira/dkg/construct.py | 1 - 2 files changed, 2 deletions(-) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index cdb09001..c5e699a7 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -3,7 +3,6 @@ import itertools as itt import logging import os -import json from collections import Counter, defaultdict from difflib import SequenceMatcher from functools import lru_cache diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index aee21845..65120296 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -33,7 +33,6 @@ import biomappings import bioontologies import click -import pydantic.error_wrappers import pyobo import pystow from bioontologies import obograph From 75b5d041087d707b74c80fc15212a2f64f6bccab Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 09:26:39 -0400 Subject: [PATCH 12/21] Add biomappings to dkg-construct extra and install dkg-construct for github tests --- .github/workflows/tests.yml | 1 + mira/dkg/api.py | 1 - setup.cfg | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git 
a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4eaa3470..97a753e8 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,7 @@ jobs: sudo apt-get install graphviz libgraphviz-dev pip install --upgrade pip setuptools wheel pip install "tox<4.0.0" + pip install .[dkg-construct] - name: Test with pytest run: | export MIRA_REST_URL=http://34.230.33.149:8771 diff --git a/mira/dkg/api.py b/mira/dkg/api.py index 6fa5638d..ce06cb8d 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -390,7 +390,6 @@ def add_resources( request.app.state.client.add_relation(relation) - class IsOntChildResult(BaseModel): """Result of a query to /is_ontological_child""" diff --git a/setup.cfg b/setup.cfg index 7053d3a9..1ffc52d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -60,6 +60,7 @@ dkg-construct = pystow tabulate tqdm + biomappings dkg-embed = grape metaregistry = From 8afb529b96fc0eedb2018c7b1e1dc528ac9f9970 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 09:35:17 -0400 Subject: [PATCH 13/21] Verify biomappings installed on github tests --- .github/workflows/tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 97a753e8..faba09e1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,6 +21,10 @@ jobs: pip install --upgrade pip setuptools wheel pip install "tox<4.0.0" pip install .[dkg-construct] + - name: Verify installed packages + run: | + pip list + pip show biomappings - name: Test with pytest run: | export MIRA_REST_URL=http://34.230.33.149:8771 From 876c37e10c9f03ec7c9d1c4fd0c137f0301105b2 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 09:49:02 -0400 Subject: [PATCH 14/21] Add dkg-construct to tox.ini --- .github/workflows/tests.yml | 5 ----- tox.ini | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index faba09e1..4eaa3470 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,11 +20,6 @@ jobs: sudo apt-get install graphviz libgraphviz-dev pip install --upgrade pip setuptools wheel pip install "tox<4.0.0" - pip install .[dkg-construct] - - name: Verify installed packages - run: | - pip list - pip show biomappings - name: Test with pytest run: | export MIRA_REST_URL=http://34.230.33.149:8771 diff --git a/tox.ini b/tox.ini index fdadd46b..cd9e4e85 100644 --- a/tox.ini +++ b/tox.ini @@ -14,6 +14,7 @@ passenv = PYTHONPATH, MIRA_REST_URL extras = tests web + dkg-construct deps = anyio<4 commands = From 6c4417f9ba469ec61bc29f965bd8d9cfcff98df9 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 09:57:04 -0400 Subject: [PATCH 15/21] Add rdflib to dkg-construct extra in setup.cfg --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 1ffc52d9..ef7f299a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -61,6 +61,7 @@ dkg-construct = tabulate tqdm biomappings + rdflib dkg-embed = grape metaregistry = From 667494019482a2cb0387c29ed2f0b42811686cf5 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 10:06:30 -0400 Subject: [PATCH 16/21] Use backwards compatiable List type annotation for method header --- mira/dkg/resources/extract_eiffel_ontology.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mira/dkg/resources/extract_eiffel_ontology.py b/mira/dkg/resources/extract_eiffel_ontology.py index 36342aa5..5ab79d05 100644 --- 
a/mira/dkg/resources/extract_eiffel_ontology.py +++ b/mira/dkg/resources/extract_eiffel_ontology.py @@ -1,4 +1,6 @@ """Get terms from the eiffel climate ontology""" +from typing import List + import curies import pystow from curies import Converter @@ -378,7 +380,7 @@ def process_sdg_series(converter: curies.Converter): return curie_to_term -def get_eiffel_ontology_terms() -> list[Term]: +def get_eiffel_ontology_terms() -> List[Term]: converter = Converter.from_prefix_map( { "ecv": "http://purl.org/eiffo/ecv#", From 081145e39587c7256ef0134e1e00263f68387a2e Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 10:21:17 -0400 Subject: [PATCH 17/21] Pass get_cso_obo reference rather than call it --- mira/dkg/construct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 65120296..1c40a062 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -391,7 +391,7 @@ def add_resource_to_dkg(resource_prefix: str): return extract_nodes_edges_from_pyobo_terms( get_eiffel_ontology_terms, "eiffel") elif resource_prefix == "cso": - return extract_nodes_edges_from_pyobo_terms(get_cso_obo(), + return extract_nodes_edges_from_pyobo_terms(get_cso_obo, "cso") elif resource_prefix == "wikidata": # combine retrieval of wikidata constants and units From ba9b237700aa153261db605ec435abc5207f799d Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 11:34:35 -0400 Subject: [PATCH 18/21] Adjust node property attributes --- mira/dkg/client.py | 2 +- mira/dkg/construct.py | 4 ++-- mira/dkg/resources/cso.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index c5e699a7..5a55c7d7 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -354,7 +354,7 @@ def add_node(self, entity): xref_types.append(xref.type) for synonym in entity.synonyms: synonyms.append(synonym.value) - synonym_types.append(synonym.value) + synonym_types.append(synonym.type) for property_predicate, property_value_list in entity.properties.items(): property_predicates.append(property_predicate) property_values.extend(property_value_list) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 1c40a062..83003377 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -218,9 +218,9 @@ def extract_nodes_edges_from_pyobo_terms(term_getter, resource_prefix): "type": entity_type, "description": term.definition if term.definition else "", "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn._fp(), + "synonyms": [Synonym(value=syn.name, type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.prefix}") + f"{syn.type.reference.identifier}") for syn in term.synonyms], "alts": term.alt_ids, "xrefs": [Xref(value=value, type=type) for value, type in diff --git a/mira/dkg/resources/cso.py b/mira/dkg/resources/cso.py index 88e8d1da..490bced9 100644 --- a/mira/dkg/resources/cso.py +++ b/mira/dkg/resources/cso.py @@ -23,7 +23,7 @@ def get_cso_obo() -> Obo: ) download(url=URL, path=PATH) # use https://github.com/pyobo/pyobo/pull/159 - return from_obo_path(PATH, prefix="cso", default_prefix="cso", strict=False) + return from_obo_path(PATH, prefix="cso", strict=False) if __name__ == "__main__": From 8ba98f39b62a46ae7ae0c6e6853f0fb9821161b7 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 12:02:54 -0400 Subject: [PATCH 19/21] Change probonto xref_type --- mira/dkg/construct.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/mira/dkg/construct.py b/mira/dkg/construct.py index 83003377..2c25d2d8 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -250,8 +250,8 @@ def extract_nodes_edges_from_pyobo_terms(term_getter, resource_prefix): "target_curie": object_reference.curie, "type": typedef.name.replace(" ", "").lower(), "pred": typedef.curie, - "source": "eiffel", - "graph": "eiffel", + "source": resource_prefix, + "graph": resource_prefix, "version": "", } ) @@ -278,7 +278,7 @@ def extract_probonto_nodes_edges(): "type": "class", "description": "", "obsolete": False, - "xrefs": [Xref(id=eq.get("curie", ""), type=eq.get("name", "")) + "xrefs": [Xref(id=eq.get("curie", ""), type="askemo:0000016") for eq in term.get("equivalent", [])], "properties": properties From 255b94b6ac28ce9673cf26af5731eef14daaf9db Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 16:35:56 -0400 Subject: [PATCH 20/21] Add dkg-construct dependency to Docker version of mira, use id field of nodes rather than curie for adding relations --- docker/Dockerfile | 2 +- mira/dkg/api.py | 2 ++ mira/dkg/client.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7254d906..6bdfa9d6 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -31,7 +31,7 @@ RUN wget -O /sw/nodes.tsv.gz https://askem-mira.s3.amazonaws.com/dkg/$domain/bui # Python packages RUN python -m pip install --upgrade pip && \ - python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client] && \ + python -m pip install git+https://github.com/gyorilab/mira.git@main#egg=mira[web,uvicorn,dkg-client,dkg-construct] && \ python -m pip uninstall -y flask_bootstrap && \ python -m pip uninstall -y bootstrap_flask && \ python -m pip install bootstrap_flask && \ diff --git a/mira/dkg/api.py b/mira/dkg/api.py index ce06cb8d..21f92d94 100644 --- a/mira/dkg/api.py +++ b/mira/dkg/api.py @@ -376,6 +376,8 @@ def add_resources( "nbcbitaxon"], ) ): + """From a list of resource prefixes, add a list of nodes and edges + extract from each resource to the DKG""" for resource_prefix in resource_prefix_list: # nodes and edges will be a list of dicts nodes, edges = add_resource_to_dkg(resource_prefix.lower()) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index 5a55c7d7..92999c7a 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -428,8 +428,8 @@ def add_relation(self, relation): graph = relation.graph create_relation_query = ( - f"MATCH (source_node {{curie: '{source_curie}'}}), " - f"(target_node {{curie: '{target_curie}'}}) " + f"MATCH (source_node {{id: '{source_curie}'}}), " + f"(target_node {{id: '{target_curie}'}}) " f"MERGE (source_node)-[rel:{type}]->(target_node)" f"SET rel.pred = '{pred}'" f"SET rel.source = '{source}'" From 13952484db73ffc7b0f5b441b8d2fcd0357a3d48 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 17 Jul 2024 17:50:58 -0400 Subject: [PATCH 21/21] Use mira defined Xref object model for nodes and process ncbitaxon nodes differently, add demo notebook --- mira/dkg/construct.py | 55 +++- notebooks/Extend_DKG_demo.ipynb | 558 ++++++++++++++++++++++++++++++++ 2 files changed, 596 insertions(+), 17 deletions(-) create mode 100644 notebooks/Extend_DKG_demo.ipynb diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 2c25d2d8..2b9a54a1 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -211,23 +211,44 @@ def extract_nodes_edges_from_pyobo_terms(term_getter, resource_prefix): elif resource_prefix in {"ncit", "ncbitaxon", 
"eiffel", "cso"}: entity_type = "class" for term in tqdm(term_getter(), unit="term"): - nodes.append( - { - "id": term.curie, - "name": term.name, - "type": entity_type, - "description": term.definition if term.definition else "", - "obsolete": False if not term.is_obsolete else True, - "synonyms": [Synonym(value=syn.name, - type=f"{syn.type.reference.prefix}:" - f"{syn.type.reference.identifier}") - for syn in term.synonyms], - "alts": term.alt_ids, - "xrefs": [Xref(value=value, type=type) for value, type in - zip(term.xrefs, term.xref_types)], - "properties": dict(term.properties), - } - ) + if resource_prefix != "ncbitaxon": + nodes.append( + { + "id": term.curie, + "name": term.name, + "type": entity_type, + "description": term.definition if term.definition else "", + "obsolete": False if not term.is_obsolete else True, + "synonyms": [Synonym(value=syn.name, + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.identifier}") + for syn in term.synonyms], + "alts": term.alt_ids, + "xrefs": [Xref(id=_id, type=type) for _id, type in + zip(term.xrefs, term.xref_types)], + "properties": dict(term.properties), + } + ) + else: + nodes.append( + { + "id": term.curie, + "name": term.name, + "type": entity_type, + "description": term.definition if term.definition else "", + "obsolete": False if not term.is_obsolete else True, + "synonyms": [Synonym(value=syn.name, + type=f"{syn.type.reference.prefix}:" + f"{syn.type.reference.identifier}") + for syn in term.synonyms], + "alts": [f"{reference.prefix}:{reference.identifier}" for + reference in term.alt_ids], + "xrefs": [Xref(id=f"{reference.prefix}:" + f"{reference.identifier}", type="") + for reference in term.xrefs], + "properties": dict(term.properties), + } + ) if resource_prefix != "eiffel": for parent in term.get_relationships(part_of): edges.append( diff --git a/notebooks/Extend_DKG_demo.ipynb b/notebooks/Extend_DKG_demo.ipynb new file mode 100644 index 00000000..71e4ecfb --- /dev/null +++ b/notebooks/Extend_DKG_demo.ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a3f4694c-6dfd-438c-ba7f-25571f8b6db0", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from mira.dkg.client import Neo4jClient" + ] + }, + { + "cell_type": "markdown", + "id": "9750dc65-e30d-43a9-acbf-5a661961e5ed", + "metadata": {}, + "source": [ + "## We define an instance of the Neo4jClient to query for the nodes and edges added to the DKG" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a69c02f1-284b-4248-bddc-b10cc5ed87cd", + "metadata": {}, + "outputs": [], + "source": [ + "client = Neo4jClient()" + ] + }, + { + "cell_type": "markdown", + "id": "780f5a69-c4cd-4130-9547-d9974a0f53ec", + "metadata": {}, + "source": [ + "# We define sample nodes to be added to the DKG" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6cdf0570-f177-4adf-a446-2216bd26089e", + "metadata": {}, + "outputs": [], + "source": [ + "node_list = []\n", + "\n", + "node1 = {\n", + " \"id\": \"ido:0000511\",\n", + " \"name\": \"infected population\",\n", + " \"type\": \"class\",\n", + " \"obsolete\": False,\n", + " \"description\": \"An organism population whose members have an infection.\",\n", + " \"synonyms\": [],\n", + " \"alts\": [],\n", + " \"xrefs\": [],\n", + " \"labels\": [\"ido\"],\n", + " \"properties\": {},\n", + " \"link\": \"string\",\n", + " \"physical_min\": 0,\n", + " \"physical_max\": 0,\n", + " \"suggested_data_type\": \"string\",\n", + " \"suggested_unit\": 
\"string\",\n", + " \"typical_min\": 0,\n", + " \"typical_max\": 0,\n", + "}\n", + "\n", + "node2 = {\n", + " \"id\": \"ido:0000514\",\n", + " \"name\": \"susceptible population\",\n", + " \"type\": \"class\",\n", + " \"obsolete\": False,\n", + " \"description\": \"An organism population whose members are not infected with an infectious agent and who lack immunity to the infectious agent.\",\n", + " \"synonyms\": [],\n", + " \"alts\": [],\n", + " \"xrefs\": [],\n", + " \"labels\": [\"ido\"],\n", + " \"properties\": {},\n", + " \"link\": \"string\",\n", + "}\n", + "\n", + "\n", + "# Define a fully instantiated node\n", + "node3 = {\n", + " \"id\": \"ido:0000511\",\n", + " \"name\": \"infected population\",\n", + " \"type\": \"class\",\n", + " \"obsolete\": False,\n", + " \"description\": \"An organism population whose members have an infection.\",\n", + " \"synonyms\": [{\"value\": \"infected pop\", \"type\": \"skos:exactMatch\"}],\n", + " \"alts\": [\"ido:0000511-alt1\", \"ido:0000511-alt2\"],\n", + " \"xrefs\": [\n", + " {\"id\": \"xref:0001\", \"type\": \"skos:exactMatch\"},\n", + " {\"id\": \"xref:0002\", \"type\": \"skos:exactMatch\"},\n", + " ],\n", + " \"labels\": [\"ido\", \"population\"],\n", + " \"properties\": {\"property1\": [\"value1\"], \"property2\": [\"value3\"]},\n", + "}\n", + "\n", + "\n", + "node_list.append(node1)\n", + "node_list.append(node2)" + ] + }, + { + "cell_type": "markdown", + "id": "60fcebd4-af01-46ae-a6ce-25b70f0f6bf5", + "metadata": {}, + "source": [ + "## Test the add_nodes endpoint \n", + "\n", + "We first test the ```add_nodes``` endpoint that takes in a list of ```Entity``` and ```AskemEntity``` objects and adds them as nodes to the DKG. Only the ```id```, ```obsolete```, and ```type``` properties are mandatory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2c3452bf-5bc3-43ca-9623-e8860fc7e0a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = requests.post(\"http://localhost:8771/api/add_nodes\", json=node_list)\n", + "response.status_code" + ] + }, + { + "cell_type": "markdown", + "id": "88a24f10-f276-45e9-a12f-7b24a605f1ba", + "metadata": {}, + "source": [ + "## We query for the added nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0a0351be-2553-460e-974b-b32777a4c8a5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'infected population', 'obsolete': False, 'description': 'An organism population whose members have an infection.', 'id': 'ido:0000511', 'type': 'class', 'labels': ['ido']}\n" + ] + } + ], + "source": [ + "query = f\"MATCH (N) WHERE N.id = 'ido:0000511' RETURN N\"\n", + "print(client.query_tx(query)[0][0]._properties)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9be0bc56-c4b5-447e-a2ce-2a00b1697e2a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'susceptible population', 'obsolete': False, 'description': 'An organism population whose members are not infected with an infectious agent and who lack immunity to the infectious agent.', 'id': 'ido:0000514', 'type': 'class', 'labels': ['ido']}\n" + ] + } + ], + "source": [ + "query = f\"MATCH (N) WHERE N.id = 'ido:0000514' RETURN N\"\n", + "print(client.query_tx(query)[0][0]._properties)" + ] + }, + { + "cell_type": "markdown", + "id": "4e4197b8-4e33-483d-b009-cad060a615d9", + "metadata": {}, + "source": [ + "## Add a fully instantiated node to the DKG\n", + "We then add a node with all of its properties supplied. Duplicate nodes (all properties must be matching for a node to be considered duplicate) are not added to the DKG." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b9d7017d-339d-4f7f-930b-1348ddd5f593", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "node_list.append(node3)\n", + "node_list.append(node1)\n", + "response = requests.post(\"http://localhost:8771/api/add_nodes\", json=node_list)\n", + "response.status_code" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "10b4c035-69f9-4df7-b3bd-bf3e0d1fa9ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'infected population', 'obsolete': False, 'description': 'An organism population whose members have an infection.', 'id': 'ido:0000511', 'type': 'class', 'labels': ['ido']}\n", + "\n", + "{'xref_types': ['skos:exactMatch', 'skos:exactMatch'], 'synonyms': ['infected pop'], 'alts': ['ido:0000511-alt1', 'ido:0000511-alt2'], 'xrefs': ['xref:0001', 'xref:0002'], 'obsolete': False, 'description': 'An organism population whose members have an infection.', 'type': 'class', 'labels': ['ido', 'population'], 'synonym_types': ['skos:exactMatch'], 'property_values': ['value1', 'value3'], 'property_predicates': ['property1', 'property2'], 'name': 'infected population', 'id': 'ido:0000511'}\n", + "\n" + ] + } + ], + "source": [ + "# We have two node objects returned from the query both with id ido:0000511 even though \n", + "# we used the add_nodes endpoint to add a node with ```id=ido:0000511``` three times\n", + "\n", + "query = f\"MATCH (N) WHERE N.id = 'ido:0000511' RETURN N\"\n", + "for n in client.query_tx(query):\n", + " print(n[0]._properties)\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "id": "c94627a4-cee9-4d7a-ac9b-ebf36cccca30", + "metadata": {}, + "source": [ + "# Test the add_relations endpoint\n", + "The ```add_relations``` endpoint takes in a list of ```Relation``` objects and adds the relation to the DKG. All properties of the relation are required. Duplicate relations are not added." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cfd152e2-b3af-428c-b264-f40c6d15825f", + "metadata": {}, + "outputs": [], + "source": [ + "relation_list = [\n", + " {\n", + " \"source_curie\": \"probonto:k0000000\",\n", + " \"target_curie\": \"probonto:k0000007\",\n", + " \"type\": \"has_parameter\",\n", + " \"pred\": \"probonto:c0000062\",\n", + " \"source\": \"probonto\",\n", + " \"graph\": \"https://raw.githubusercontent.com/probonto/ontology/master/probonto4ols.owl\",\n", + " \"version\": \"2.5\",\n", + " },\n", + " {\n", + " \"source_curie\": \"geonames:12042053\",\n", + " \"target_curie\": \"geonames:292969\",\n", + " \"type\": \"part_of\",\n", + " \"pred\": \"bfo:0000050\",\n", + " \"source\": \"geonames\",\n", + " \"graph\": \"geonames\",\n", + " \"version\": \"\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bb5c11cb-75b7-429e-a5b9-64af0e4a866c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "response = requests.post(\n", + " \"http://localhost:8771/api/add_relations\", json=relation_list\n", + ")\n", + "response.status_code" + ] + }, + { + "cell_type": "markdown", + "id": "53cc6281-ff93-40f4-8b55-515c9a4c6de1", + "metadata": {}, + "source": [ + "# We query for the added relations" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "973f3bed-665c-4aba-829c-9ea5e6258487", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Node : {'property_values': ['probability'], 'xref_types': ['askemo:0000016'], 'property_predicates': ['has_parameter'], 'name': 'Bernoulli1', 'obsolete': False, 'xrefs': ['probonto:k0000028'], 'id': 'probonto:k0000000', 'source': 'probonto', 'type': 'class', 'version': '2.5'} \n", + "\n", + "Relation : {'pred': 'probonto:c0000062', 'source': 'probonto', 'version': '2.5', 'graph': 'https://raw.githubusercontent.com/probonto/ontology/master/probonto4ols.owl'} \n", + "\n", + "Target Node : {'synonym_types': ['referenced_by_latex', 'oboInOwl:hasExactSynonym'], 'synonyms': ['p', 'probability of success'], 'name': 'probability', 'obsolete': False, 'id': 'probonto:k0000007', 'type': 'class', 'version': '2.5'} \n", + "\n" + ] + } + ], + "source": [ + "source_curie = \"probonto:k0000000\"\n", + "target_curie = \"probonto:k0000007\"\n", + "rel_type = \"has_parameter\"\n", + "\n", + "relation_query = f\"MATCH (source_node {{id: '{source_curie}'}}), (target_node {{id: '{target_curie}'}}) MATCH (source_node)-[rel:{rel_type}]->(target_node) RETURN source_node, rel, target_node\"\n", + "\n", + "result = client.query_tx(relation_query)\n", + "\n", + "print(f\"Source Node : {result[0][0]._properties} \\n\")\n", + "print(f\"Relation : {result[0][1]._properties} \\n\")\n", + "print(f\"Target Node : {result[0][2]._properties} \\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b8333206-4b12-44ad-b180-d1e63670111f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Source Node : ._properties \n", + "\n", + "Relation : {'pred': 'bfo:0000050', 'source': 'geonames', 'version': '', 'graph': 'geonames'} \n", + "\n", + "Target Node : {'property_values': ['AE.01'], 'property_predicates': ['code'], 'name': 'Abu Dhabi', 'obsolete': False, 'id': 'geonames:292969', 'type': 'individual'} \n", + "\n" + ] + } + ], + "source": [ + "source_curie = 
\"geonames:12042053\"\n", + "target_curie = \"geonames:292969\"\n", + "rel_type = \"part_of\"\n", + "\n", + "relation_query = f\"MATCH (source_node {{id: '{source_curie}'}}), (target_node {{id: '{target_curie}'}}) MATCH (source_node)-[rel:{rel_type}]->(target_node) RETURN source_node, rel, target_node\"\n", + "\n", + "result = client.query_tx(relation_query)\n", + "\n", + "print(f\"Source Node : {result[0][0]}._properties \\n\")\n", + "print(f\"Relation : {result[0][1]._properties} \\n\")\n", + "print(f\"Target Node : {result[0][2]._properties} \\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "091a04f3-a2e0-4113-9fc8-272dc99e688f", + "metadata": {}, + "source": [ + "# Test the add_resources endpoint\n", + "The ```add_resources``` endpoint accepts a list of strings that represent resource prefixes. Nodes and edges are extracted from each resource and then added to the DKG. The resources that can be added are ```eiffel, cso, wikidata, probonto, ncit, ncbitaxon, geonames```. The names are not case-sensitive and invalid resource prefixes are ignored. " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c00ac8c4-59d5-4533-a95a-17c6def957b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "200" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "resource_list = [\n", + " \"probonto\",\n", + " \"wikidata\",\n", + " \"eiffel\",\n", + " \"geonames\",\n", + " \"ncit\",\n", + " \"nbcbitaxon\",\n", + " \"cso\",\n", + "]\n", + "\n", + "response = requests.post(\n", + " \"http://localhost:8771/api/add_resources\", json=resource_list\n", + ")\n", + "response.status_code" + ] + }, + { + "cell_type": "markdown", + "id": "4b646cdc-5f60-4ed2-8dfe-e510bd8469d5", + "metadata": {}, + "source": [ + "# We then query for some of the added nodes from the resources processed" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a66a111c-5aac-485c-b77d-49d856b693cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'synonym_types': ['debio:0000031'],\n", + " 'property_values': ['0.01438776877'],\n", + " 'xref_types': ['oboinowl:hasDbXref'],\n", + " 'synonyms': ['c_{2}'],\n", + " 'property_predicates': ['debio:0000042'],\n", + " 'xrefs': ['nist.codata:c22ndrc'],\n", + " 'name': 'second radiation constant',\n", + " 'obsolete': False,\n", + " 'description': \"constant in Wien's radiation law\",\n", + " 'id': 'wikidata:Q112300321',\n", + " 'type': 'class'}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = f\"MATCH (N) WHERE N.id = 'wikidata:Q112300321' RETURN N\"\n", + "client.query_tx(query)[0][0]._properties" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1194936e-5c93-4d8b-887f-a2c4395dc599", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'synonym_types': ['oboinowl:SynonymType',\n", + " 'oboinowl:SynonymType',\n", + " 'oboinowl:SynonymType',\n", + " 'oboinowl:SynonymType',\n", + " 'oboinowl:SynonymType'],\n", + " 'synonyms': ['Musaffa', 'Musaffah City', 'msfh', 'Мусаффа', 'مصفح'],\n", + " 'name': 'Musaffah',\n", + " 'obsolete': False,\n", + " 'id': 'geonames:12042053',\n", + " 'type': 'individual'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = f\"MATCH (N) WHERE N.id = 'geonames:12042053' RETURN N\"\n", + "client.query_tx(query)[0][0]._properties" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 16, + "id": "ae1442ef-7558-4b47-8bd5-0c87b309102a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'synonym_types': ['oboinowl:SynonymType', 'oboinowl:SynonymType'],\n", + " 'property_values': ['C123547',\n", + " 'Eukaryote',\n", + " 'Plasmodium falciparum',\n", + " 'C0032150',\n", + " 'CDISC',\n", + " 'Any unicellular, eukaryotic organism that can be assigned to the species Plasmodium falciparum.',\n", + " '5833'],\n", + " 'synonyms': ['PLASMODIUM FALCIPARUM', 'Plasmodium falciparum'],\n", + " 'property_predicates': ['NCIT:NHC0',\n", + " 'NCIT:P106',\n", + " 'NCIT:P108',\n", + " 'NCIT:P207',\n", + " 'NCIT:P322',\n", + " 'NCIT:P325',\n", + " 'NCIT:P331'],\n", + " 'name': 'Plasmodium falciparum',\n", + " 'obsolete': False,\n", + " 'description': 'A protozoan parasite in the family Plasmodiidae. P. falciparum is transmitted by the female Anopheles mosquito and is a causative agent of malaria in humans. The malaria caused by this species is the most dangerous form of malaria.',\n", + " 'id': 'ncit:C123547',\n", + " 'type': 'class'}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = f\"MATCH (N) WHERE N.id = 'ncit:C123547' RETURN N\"\n", + "client.query_tx(query)[0][0]._properties" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}