From 7e3c7289d78f7f097f199e43d1fb903604a1df30 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Wed, 25 Sep 2024 14:52:00 -0400 Subject: [PATCH 1/4] Use obograph.Xref instance and remove prefixes that don't have graph data --- mira/dkg/construct.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 3b8b04ae..35c4f2e1 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -985,6 +985,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: if parse_results.graph_document is None: click.secho(f"No graphs in {prefix}, skipping", fg="red") + use_case_paths.EDGES_PATHS.pop(prefix) continue _graphs = parse_results.graph_document.graphs @@ -1104,7 +1105,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: if add_xref_edges: for xref in node.xrefs: - if not isinstance(xref, Xref): + if not isinstance(xref, obograph.Xref): raise TypeError(f"Invalid type: {type(xref)}: {xref}") if not xref.value: continue From d12d1bdb046479a642a60dff6cbb688fba3220fb Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Thu, 26 Sep 2024 09:56:48 -0400 Subject: [PATCH 2/4] Check for stripped normalized text --- mira/dkg/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mira/dkg/client.py b/mira/dkg/client.py index 972a84c7..43b1b87d 100644 --- a/mira/dkg/client.py +++ b/mira/dkg/client.py @@ -803,7 +803,7 @@ def get_terms( ) for synonym in synonyms or []: norm_text = normalize(synonym) - if norm_text: + if norm_text.strip(): yield Term( norm_text=norm_text, text=synonym, From 30875a79132d1b51440a2e7ecc92d6ebc47bbcc4 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Fri, 27 Sep 2024 11:23:42 -0400 Subject: [PATCH 3/4] Perform admin region processing outside of loop that processes cities --- mira/dkg/resources/geonames.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mira/dkg/resources/geonames.py b/mira/dkg/resources/geonames.py index 1f25613a..542c2b9c 100644 --- a/mira/dkg/resources/geonames.py +++ b/mira/dkg/resources/geonames.py @@ -109,11 +109,16 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul ), ) + cities_df = cities_df[cities_df.population.astype(int) > minimum_population] cities_df.synonyms = cities_df.synonyms.str.split(",") terms = {} for term in code_to_country.values(): terms[term.identifier] = term + for term in code_to_admin1.values(): + terms[term.identifier] = term + for term in code_to_admin2.values(): + terms[term.identifier] = term cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "population"] for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values): @@ -131,8 +136,6 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul print("could not find admin1", admin1_full) continue - terms[admin1_term.identifier] = admin1_term - if pd.notna(admin2): admin2_full = f"{country}.{admin1}.{admin2}" admin2_term = code_to_admin2.get(admin2_full) @@ -141,15 +144,9 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul # print("could not find admin2", admin2_full) else: term.append_relationship(part_of, admin2_term) - terms[admin2_term.identifier] = admin2_term else: # pd.notna(admin1): # If there's no admin 2, just annotate directly onto admin 1 term.append_relationship(part_of, admin1_term) - # We skip cities that don't meet the minimum population requirement - if int(population) < minimum_population: - continue - terms[identifier] = term = Term.from_triple("geonames", identifier, - name) return terms From 7346375929f44b23ddc3c2d9072e659446e35387 Mon Sep 17 00:00:00 2001 From: nanglo123 Date: Mon, 30 Sep 2024 09:58:22 -0400 Subject: [PATCH 4/4] Readd city term processing and don't add duplicate xref edges --- mira/dkg/construct.py | 6 +++--- mira/dkg/resources/geonames.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py index 35c4f2e1..c136f770 100644 --- a/mira/dkg/construct.py +++ b/mira/dkg/construct.py @@ -1112,8 +1112,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: if xref.value.prefix in obograph.PROVENANCE_PREFIXES: # Don't add provenance information as xrefs continue - edges.append( - ( + xref_edge_info = ( node.curie, xref.value.curie, "xref", @@ -1122,7 +1121,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str: graph_id, version or "", ) - ) + if xref_edge_info not in edges: + edges.append(xref_edge_info) if xref.value.curie not in nodes: node_sources[node.replaced_by].add(prefix) nodes[xref.value.curie] = NodeInfo( diff --git a/mira/dkg/resources/geonames.py b/mira/dkg/resources/geonames.py index 542c2b9c..2b782158 100644 --- a/mira/dkg/resources/geonames.py +++ b/mira/dkg/resources/geonames.py @@ -122,6 +122,7 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul cols = ["geonames_id", "name", "synonyms", "country_code", "admin1", "admin2", "population"] for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values): + terms[identifier] = term = Term.from_triple("geonames", identifier,name) if synonyms and not isinstance(synonyms, float): for synoynm in synonyms: term.append_synonym(synoynm)