Merge pull request #376 from nanglo123/fix_dkg_construct

Fix the epidemiology DKG construction pipeline and docker container startup
gyorilab · Oct 1, 2024 · 7a9b2e4 · 7a9b2e4
2 parents bd7095d + 7346375
commit 7a9b2e4
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 13 deletions.
diff --git a/mira/dkg/client.py b/mira/dkg/client.py
@@ -803,7 +803,7 @@ def get_terms(
         )
     for synonym in synonyms or []:
         norm_text = normalize(synonym)
-        if norm_text:
+        if norm_text.strip():
             yield Term(
                 norm_text=norm_text,
                 text=synonym,

diff --git a/mira/dkg/construct.py b/mira/dkg/construct.py
@@ -985,6 +985,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
 
         if parse_results.graph_document is None:
             click.secho(f"No graphs in {prefix}, skipping", fg="red")
+            use_case_paths.EDGES_PATHS.pop(prefix)
             continue
 
         _graphs = parse_results.graph_document.graphs
@@ -1104,15 +1105,14 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
 
                 if add_xref_edges:
                     for xref in node.xrefs:
-                        if not isinstance(xref, Xref):
+                        if not isinstance(xref, obograph.Xref):
                             raise TypeError(f"Invalid type: {type(xref)}: {xref}")
                         if not xref.value:
                             continue
                         if xref.value.prefix in obograph.PROVENANCE_PREFIXES:
                             # Don't add provenance information as xrefs
                             continue
-                        edges.append(
-                            (
+                        xref_edge_info = (
                                 node.curie,
                                 xref.value.curie,
                                 "xref",
@@ -1121,7 +1121,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
                                 graph_id,
                                 version or "",
                             )
-                        )
+                        if xref_edge_info not in edges:
+                            edges.append(xref_edge_info)
                         if xref.value.curie not in nodes:
                             node_sources[node.replaced_by].add(prefix)
                             nodes[xref.value.curie] = NodeInfo(

diff --git a/mira/dkg/resources/geonames.py b/mira/dkg/resources/geonames.py
@@ -109,14 +109,20 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
         ),
     )
 
+    cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
     cities_df.synonyms = cities_df.synonyms.str.split(",")
 
     terms = {}
     for term in code_to_country.values():
         terms[term.identifier] = term
+    for term in code_to_admin1.values():
+        terms[term.identifier] = term
+    for term in code_to_admin2.values():
+        terms[term.identifier] = term
     cols = ["geonames_id", "name", "synonyms", "country_code", "admin1",
             "admin2", "population"]
     for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values):
+        terms[identifier] = term = Term.from_triple("geonames", identifier,name)
         if synonyms and not isinstance(synonyms, float):
             for synoynm in synonyms:
                 term.append_synonym(synoynm)
@@ -131,8 +137,6 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
             print("could not find admin1", admin1_full)
             continue
 
-        terms[admin1_term.identifier] = admin1_term
-
         if pd.notna(admin2):
             admin2_full = f"{country}.{admin1}.{admin2}"
             admin2_term = code_to_admin2.get(admin2_full)
@@ -141,15 +145,9 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
                 # print("could not find admin2", admin2_full)
             else:
                 term.append_relationship(part_of, admin2_term)
-                terms[admin2_term.identifier] = admin2_term
 
         else:  # pd.notna(admin1):
             # If there's no admin 2, just annotate directly onto admin 1
             term.append_relationship(part_of, admin1_term)
 
-        # We skip cities that don't meet the minimum population requirement
-        if int(population) < minimum_population:
-            continue
-        terms[identifier] = term = Term.from_triple("geonames", identifier,
-                                                    name)
     return terms