Skip to content

Commit

Permalink
Merge pull request #376 from nanglo123/fix_dkg_construct
Browse files Browse the repository at this point in the history
Fix the epidemiology DKG construction pipeline and docker container startup
  • Loading branch information
bgyori authored Oct 1, 2024
2 parents bd7095d + 7346375 commit 7a9b2e4
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 13 deletions.
2 changes: 1 addition & 1 deletion mira/dkg/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ def get_terms(
)
for synonym in synonyms or []:
norm_text = normalize(synonym)
if norm_text:
if norm_text.strip():
yield Term(
norm_text=norm_text,
text=synonym,
Expand Down
9 changes: 5 additions & 4 deletions mira/dkg/construct.py
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,7 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:

if parse_results.graph_document is None:
click.secho(f"No graphs in {prefix}, skipping", fg="red")
use_case_paths.EDGES_PATHS.pop(prefix)
continue

_graphs = parse_results.graph_document.graphs
Expand Down Expand Up @@ -1104,15 +1105,14 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:

if add_xref_edges:
for xref in node.xrefs:
if not isinstance(xref, Xref):
if not isinstance(xref, obograph.Xref):
raise TypeError(f"Invalid type: {type(xref)}: {xref}")
if not xref.value:
continue
if xref.value.prefix in obograph.PROVENANCE_PREFIXES:
# Don't add provenance information as xrefs
continue
edges.append(
(
xref_edge_info = (
node.curie,
xref.value.curie,
"xref",
Expand All @@ -1121,7 +1121,8 @@ def _get_edge_name(curie_: str, strict: bool = False) -> str:
graph_id,
version or "",
)
)
if xref_edge_info not in edges:
edges.append(xref_edge_info)
if xref.value.curie not in nodes:
node_sources[node.replaced_by].add(prefix)
nodes[xref.value.curie] = NodeInfo(
Expand Down
14 changes: 6 additions & 8 deletions mira/dkg/resources/geonames.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,20 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
),
)

cities_df = cities_df[cities_df.population.astype(int) > minimum_population]
cities_df.synonyms = cities_df.synonyms.str.split(",")

terms = {}
for term in code_to_country.values():
terms[term.identifier] = term
for term in code_to_admin1.values():
terms[term.identifier] = term
for term in code_to_admin2.values():
terms[term.identifier] = term
cols = ["geonames_id", "name", "synonyms", "country_code", "admin1",
"admin2", "population"]
for identifier, name, synonyms, country, admin1, admin2, population in (cities_df[cols].values):
terms[identifier] = term = Term.from_triple("geonames", identifier,name)
if synonyms and not isinstance(synonyms, float):
for synoynm in synonyms:
term.append_synonym(synoynm)
Expand All @@ -131,8 +137,6 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
print("could not find admin1", admin1_full)
continue

terms[admin1_term.identifier] = admin1_term

if pd.notna(admin2):
admin2_full = f"{country}.{admin1}.{admin2}"
admin2_term = code_to_admin2.get(admin2_full)
Expand All @@ -141,15 +145,9 @@ def get_cities(code_to_country, code_to_admin1, code_to_admin2, *, minimum_popul
# print("could not find admin2", admin2_full)
else:
term.append_relationship(part_of, admin2_term)
terms[admin2_term.identifier] = admin2_term

else: # pd.notna(admin1):
# If there's no admin 2, just annotate directly onto admin 1
term.append_relationship(part_of, admin1_term)

# We skip cities that don't meet the minimum population requirement
if int(population) < minimum_population:
continue
terms[identifier] = term = Term.from_triple("geonames", identifier,
name)
return terms

0 comments on commit 7a9b2e4

Please sign in to comment.