From 5ff25e64a0fc77326d4b78962ffd2bcc8769ebe6 Mon Sep 17 00:00:00 2001 From: James McLaughlin Date: Wed, 11 Sep 2024 15:45:50 +0100 Subject: [PATCH] fix hgnc id pollution caused by mondo in ols --- 01_ingest/grebi_ingest_ols/src/main.rs | 87 ++++++++++--------- .../grebi_extract_identifiers/src/main.rs | 22 ++++- configs/datasource_configs/hgnc.json | 4 + 3 files changed, 71 insertions(+), 42 deletions(-) diff --git a/01_ingest/grebi_ingest_ols/src/main.rs b/01_ingest/grebi_ingest_ols/src/main.rs index e9baded..2dafb8c 100644 --- a/01_ingest/grebi_ingest_ols/src/main.rs +++ b/01_ingest/grebi_ingest_ols/src/main.rs @@ -172,51 +172,58 @@ fn read_entities(json: &mut JsonStreamReader>>, output_n } } - //if grebitype.eq("ols:Property") { - - let qualified_safe_label = { - let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string(); - let pref_prefix = { - if curie.contains(":") { - Some(curie.split(":").next().unwrap().to_ascii_lowercase()) + let qualified_safe_label = { + let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string(); + let pref_prefix = { + if curie.contains(":") { + Some(curie.split(":").next().unwrap().to_ascii_lowercase()) + } else { + let definedBy = obj.get("ols:definedBy"); + if definedBy.is_some() { + Some(get_string_values(definedBy.unwrap()).iter().next().unwrap().to_string()) } else { - let definedBy = obj.get("ols:definedBy"); - if definedBy.is_some() { - Some(get_string_values(definedBy.unwrap()).iter().next().unwrap().to_string()) - } else { - None - } + None } - }; - if !pref_prefix.is_some() { - curie.to_string() - } else { - let pref_prefix_u = pref_prefix.unwrap().to_string(); - let label = get_string_values(obj.get("ols:label").unwrap()).iter().next().unwrap().to_string(); - - // this might not be a real label, in which case just return the curie - if label.starts_with(&(pref_prefix_u.to_owned() + ":")) || label.starts_with(&(pref_prefix_u.to_owned() + "_")) { - curie.to_string() - } else { - pref_prefix_u.to_string() + ":" + &label.to_string().as_bytes().iter().map(|x| { - if x.is_ascii_alphanumeric() { - *x as char - } else { - '_' - } - }).collect::() - } - } + } }; + if !pref_prefix.is_some() { + obj.get("ols:iri").unwrap().as_str().unwrap().to_string() + } else { + let pref_prefix_u = pref_prefix.unwrap().to_string(); + let label = get_string_values(obj.get("ols:label").unwrap()).iter().next().unwrap().to_string(); - output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap(); - output_nodes.write_all(Value::String(qualified_safe_label).to_string().as_bytes()).unwrap(); - /*} else { - output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap(); - let curie = get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().to_string(); - output_nodes.write_all(Value::String(curie).to_string().as_bytes()).unwrap(); - }*/ + // this might not be a real label, in which case just return the curie + if label.starts_with(&(pref_prefix_u.to_owned() + ":")) || label.starts_with(&(pref_prefix_u.to_owned() + "_")) { + curie.to_string() + } else { + pref_prefix_u.to_string() + ":" + &label.to_string().as_bytes().iter().map(|x| { + if x.is_ascii_alphanumeric() { + *x as char + } else { + '_' + } + }).collect::() + } + } + }; + + // Remove unprefixed IDs to avoid polluting ID space, e.g. + // https://www.ebi.ac.uk/ols4/api/v2/ontologies/mondo/classes/http%253A%252F%252Fidentifiers.org%252Fhgnc%252F4044 + // TODO: fix this in OLS? + // + if obj.contains_key("ols:curie") { + if !get_string_values(obj.get("ols:curie").unwrap()).iter().next().unwrap().contains(":") { + obj.remove_entry("ols:curie"); + } + } + if obj.contains_key("ols:shortForm") { + if !get_string_values(obj.get("ols:shortForm").unwrap()).iter().next().unwrap().contains("_") { + obj.remove_entry("ols:shortForm"); + } + } + output_nodes.write_all(r#"{"id":"#.as_bytes()).unwrap(); + output_nodes.write_all(Value::String(qualified_safe_label).to_string().as_bytes()).unwrap(); output_nodes.write_all(r#","grebi:datasource":""#.as_bytes()).unwrap(); output_nodes.write_all(datasource.as_bytes()).unwrap(); output_nodes.write_all(r#"","grebi:type":[""#.as_bytes()).unwrap(); diff --git a/02_assign_ids/grebi_extract_identifiers/src/main.rs b/02_assign_ids/grebi_extract_identifiers/src/main.rs index d19c361..5230e4e 100644 --- a/02_assign_ids/grebi_extract_identifiers/src/main.rs +++ b/02_assign_ids/grebi_extract_identifiers/src/main.rs @@ -76,7 +76,9 @@ fn main() { } else { wrote_any = true; } - writer.write_all(&json.string()).unwrap(); + let id = json.string(); + check_id(&k, &id); + writer.write_all(&id).unwrap(); } else { json.value(); // skip } @@ -88,7 +90,9 @@ fn main() { } else { wrote_any = true; } - writer.write_all(&json.string()).unwrap(); + let id = json.string(); + check_id(&k, &id); + writer.write_all(&id).unwrap(); } else { json.value(); // skip } @@ -110,5 +114,19 @@ fn main() { } +fn check_id(k:&[u8], id:&[u8]) { + let mut has_non_numeric = false; + for c in id { + if !c.is_ascii_digit() { + has_non_numeric = true; + break; + } + } + if !has_non_numeric { + panic!("Found unprefixed numeric ID {} for identifier property {}. Unqualified numbers like this as identifiers are ambiguous and may cause incorrect equivalences.", String::from_utf8_lossy(id), String::from_utf8_lossy(k)); + } +} + + diff --git a/configs/datasource_configs/hgnc.json b/configs/datasource_configs/hgnc.json index 977bc7f..858e8f9 100644 --- a/configs/datasource_configs/hgnc.json +++ b/configs/datasource_configs/hgnc.json @@ -13,6 +13,10 @@ { "name": "--json-inject-key-prefix", "value": "hgnc:" }, { "name": "--json-inject-value-prefix", "value": "uniprot_ids:uniprot:" }, { "name": "--json-inject-value-prefix", "value": "omim_id:omim:" }, + { "name": "--json-inject-value-prefix", "value": "ena:ena:" }, + { "name": "--json-inject-value-prefix", "value": "vega_id:vega:" }, + { "name": "--json-inject-value-prefix", "value": "ccds_id:ccds:" }, + { "name": "--json-inject-value-prefix", "value": "entrez_id:entrez:" }, { "name": "--json-inject-value-prefix", "value": "pubmed_id:pmid:" } ] }