extract superclass->type to a separate program
jamesamcl committed Aug 13, 2024
1 parent f3c8205 commit 0b3b250
Showing 17 changed files with 289 additions and 59 deletions.
26 changes: 7 additions & 19 deletions 01_ingest/grebi_ingest_ols/src/main.rs
@@ -24,10 +24,8 @@ struct Args {
ontologies:String,

#[arg(long)]
defining_only:bool,
defining_only:bool

#[arg(long)]
superclass_is_type:Vec<String>,
}

fn main() {
@@ -49,7 +47,6 @@ fn main() {
ontology_whitelist.insert(ontology.to_string());
}

let mut type_superclasses:HashSet<String> = args.superclass_is_type.iter().map(|x| x.to_string()).collect();

let mut json = JsonStreamReader::new(reader);

@@ -60,14 +57,14 @@
}
json.begin_array().unwrap();
while json.has_next().unwrap() {
read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only, &type_superclasses);
read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only);
}
json.end_array().unwrap();
json.end_object().unwrap();

}

fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool, type_superclasses:&HashSet<String>) {
fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool) {

json.begin_object().unwrap();

@@ -131,11 +128,11 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

loop {
if key.eq("classes") {
read_entities(json, output_nodes, &datasource, "ols:Class", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Class", defining_only);
} else if key.eq("properties") {
read_entities(json, output_nodes, &datasource, "ols:Property", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Property", defining_only);
} else if key.eq("individuals") {
read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only);
} else {
panic!();
}
@@ -150,7 +147,7 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

}

fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool, type_superclasses:&HashSet<String>) {
fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool) {
json.begin_array().unwrap();
while json.has_next().unwrap() {
let mut val:Value = read_value(json);
@@ -214,15 +211,6 @@ fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
output_nodes.write_all(r#"","grebi:type":[""#.as_bytes()).unwrap();
output_nodes.write_all(grebitype.as_bytes()).unwrap();
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
if obj.contains_key("ols:directAncestor") {
for ancestor in get_string_values(obj.get("ols:directAncestor").unwrap()) {
if type_superclasses.contains(ancestor) {
output_nodes.write_all(r#",""#.as_bytes()).unwrap();
write_escaped_string(&ancestor.as_bytes(), output_nodes);
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
}
}
}
output_nodes.write_all(r#"]"#.as_bytes()).unwrap();

for k in obj.keys() {
31 changes: 2 additions & 29 deletions 02_assign_ids/grebi_assign_ids/src/main.rs
@@ -11,6 +11,7 @@ use grebi_shared::json_parser::JsonParser;
use clap::Parser;

use grebi_shared::find_strings;
use grebi_shared::load_groups_txt::load_groups_txt;


#[derive(clap::Parser, Debug)]
@@ -44,35 +45,7 @@ fn main() {

let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {

let start_time = std::time::Instant::now();
let mut reader = BufReader::new(File::open( args.groups_txt ).unwrap() );
let mut mapping:HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

loop {
let mut line: Vec<u8> = Vec::new();
reader.read_until(b'\n', &mut line).unwrap();

if line.len() == 0 {
break;
}
if line[line.len() - 1] == b'\n' {
line.pop();
}

let tokens:Vec<&[u8]> = line.split(|&x| x == b'\t').collect();

for i in 1..tokens.len() {
mapping.insert(tokens[i].to_vec(), tokens[0].to_vec());
}
}

eprintln!("loaded {} id->group mappings in {} seconds", mapping.len(), start_time.elapsed().as_secs());

mapping
};

let id_to_group:HashMap<Vec<u8>, Vec<u8>> = load_groups_txt(&args.groups_txt);

let start_time = std::time::Instant::now();

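The load_groups_txt helper referenced here lives in grebi_shared and is not itself part of this diff. Judging from the inlined code it replaces above, it presumably looks roughly like the sketch below; the exact signature and error handling in grebi_shared may differ.

use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};

// Sketch of grebi_shared::load_groups_txt, reconstructed from the inlined code
// removed above (the real helper may differ in detail). Each line of groups.txt
// is tab-separated: the first token is the group id, and every following token
// is an identifier assigned to that group.
//
// Matches both call sites in this commit:
//   load_groups_txt(&args.groups_txt)
//   load_groups_txt(args.groups_txt.as_str())
pub fn load_groups_txt(path: &str) -> HashMap<Vec<u8>, Vec<u8>> {
    let start_time = std::time::Instant::now();
    let mut reader = BufReader::new(File::open(path).unwrap());
    let mut mapping: HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

    loop {
        let mut line: Vec<u8> = Vec::new();
        reader.read_until(b'\n', &mut line).unwrap();
        if line.is_empty() {
            break;
        }
        if line[line.len() - 1] == b'\n' {
            line.pop();
        }
        let tokens: Vec<&[u8]> = line.split(|&x| x == b'\t').collect();
        for i in 1..tokens.len() {
            mapping.insert(tokens[i].to_vec(), tokens[0].to_vec());
        }
    }

    eprintln!("loaded {} id->group mappings in {} seconds",
        mapping.len(), start_time.elapsed().as_secs());
    mapping
}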
4 changes: 4 additions & 0 deletions 02_assign_ids/grebi_identifiers2groups/src/main.rs
@@ -163,13 +163,17 @@ fn main() {
// - CURIEs
// - textual (readable) IDs rather than numeric
// - "grebi:" IDs always win (used to consolidate names on grebi:name etc.)
// - "biolink:" IDs are a second best
// lower score is better
//
fn id_score(id:&[u8]) -> i32 {

if id.starts_with(b"grebi:") {
return i32::MIN;
}
if id.starts_with(b"biolink:") {
return i32::MIN+1000;
}

let mut score = 0;

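Only the top of id_score is visible in this hunk; the CURIE and readability heuristics that follow, and the call site that consumes the scores, are elsewhere in the file. A hedged sketch of how the scores would presumably be applied when choosing the preferred identifier for a group (pick_preferred_id is a hypothetical helper, not code from this repository):

// Prefix rules as shown in the hunk above; the CURIE/readability heuristics
// that follow in the real function are elided and replaced by a placeholder.
fn id_score(id: &[u8]) -> i32 {
    if id.starts_with(b"grebi:") {
        return i32::MIN;
    }
    if id.starts_with(b"biolink:") {
        return i32::MIN + 1000;
    }
    0 // placeholder for the remaining heuristics
}

// Hypothetical helper: lower score wins, so a "grebi:" id beats a "biolink:" id,
// which in turn beats ordinary identifiers.
fn pick_preferred_id<'a>(ids: &[&'a [u8]]) -> Option<&'a [u8]> {
    ids.iter().copied().min_by_key(|id| id_score(id))
}

fn main() {
    let group: Vec<&[u8]> = vec![
        b"ogms:0000031".as_slice(),
        b"mondo:0000001".as_slice(),
        b"biolink:Disease".as_slice(),
    ];
    assert_eq!(pick_preferred_id(&group), Some(b"biolink:Disease".as_slice()));
}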
14 changes: 14 additions & 0 deletions 02_assign_ids/grebi_superclasses2types/Cargo.toml
@@ -0,0 +1,14 @@
[package]
name = "grebi_superclasses2types"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = { version = "1.0.108", features=["preserve_order"] }
grebi_shared = { path = "../../grebi_shared" }
csv = "1.3.0"
fasthash = "0.4.0"
lmdb-zero = "0.4.4"
bloomfilter = "1.0.13"
jemallocator = "0.5.4"
clap = { version = "4.4.11", features = ["derive"] }
154 changes: 154 additions & 0 deletions 02_assign_ids/grebi_superclasses2types/src/main.rs
@@ -0,0 +1,154 @@




use std::collections::{BTreeSet, HashMap, HashSet};
use std::fs::File;
use std::{env, io};
use std::io::{BufRead, BufReader };
use std::io::{Write, BufWriter};
use grebi_shared::json_lexer::{lex, JsonTokenType};
use grebi_shared::json_parser::JsonParser;
use clap::Parser;

use grebi_shared::find_strings;
use grebi_shared::load_groups_txt::load_groups_txt;


#[derive(clap::Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {

#[arg(long)]
groups_txt:String,

#[arg(long)]
type_superclasses:String,

}

#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn main() {

let args = Args::parse();

let mut type_superclasses:HashSet<Vec<u8>> = {
let id_to_group:HashMap<Vec<u8>, Vec<u8>> = load_groups_txt(args.groups_txt.as_str());
let mut res = HashSet::new();
for prop in args.type_superclasses.split(",") {
let mapped = id_to_group.get(prop.as_bytes());
if mapped.is_some() {
res.insert(mapped.unwrap().to_vec());
} else {
res.insert(prop.as_bytes().to_vec());
}
}
res
};

let start_time = std::time::Instant::now();

let stdin = io::stdin();
let handle = stdin.lock();
let mut reader = BufReader::new(handle);

let stdout = io::stdout().lock();
let mut writer = BufWriter::new(stdout);

loop {
let mut line: Vec<u8> = Vec::new();
reader.read_until(b'\n', &mut line).unwrap();

if line.len() == 0 {
break;
}

let mut json = JsonParser::parse(&line);

let mut id:Option<&[u8]> = None;
let mut types:BTreeSet<&[u8]> = BTreeSet::new();

json.begin_object();
json.mark();
while json.peek().kind != JsonTokenType::EndObject {
let prop_key = json.name();

if prop_key.eq(b"grebi:type") {
if json.peek().kind == JsonTokenType::StartArray {
json.begin_array();
while json.peek().kind != JsonTokenType::EndArray {
types.insert(json.string());
}
json.end_array();
} else {
types.insert(json.string());
}
} else if prop_key.eq(b"ols:directAncestor") {
if json.peek().kind == JsonTokenType::StartArray {
json.begin_array();
while json.peek().kind != JsonTokenType::EndArray {
let ancestor = json.string();
if type_superclasses.contains(ancestor) {
types.insert(ancestor);
}
}
json.end_array();
} else {
let ancestor = json.string();
if type_superclasses.contains(ancestor) {
types.insert(ancestor);
}
}
} else {
json.value(); // skip
}
}

json.rewind();

writer.write_all(b"{").unwrap();

let mut is_first = true;

while json.peek().kind != JsonTokenType::EndObject {
if is_first {
is_first = false;
} else {
writer.write_all(b",").unwrap();
}

let name = json.name();

writer.write_all(b"\"").unwrap();
writer.write_all(name).unwrap();
writer.write_all(b"\":").unwrap();

if name.eq(b"grebi:type") {
json.value(); // skip, we already have the types

writer.write_all(b"[").unwrap();
let mut is_first_type = true;
for t in types.iter() {
if is_first_type {
is_first_type = false;
} else {
writer.write_all(b",").unwrap();
}
writer.write_all(b"\"").unwrap();
writer.write_all(t).unwrap();
writer.write_all(b"\"").unwrap();
}
writer.write_all(b"]").unwrap();
} else {
writer.write_all(json.value()).unwrap();
}
}

writer.write_all(b"}\n").unwrap();
}

eprintln!("completed superclass2types in {}", start_time.elapsed().as_secs());

}
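A minimal sketch of the per-line rewrite this program performs, written against serde_json (one of the crate's declared dependencies) rather than the streaming JsonParser used above; mondo:0000001 is taken from the type_superclasses entries in the subgraph configs below, while the other identifiers are placeholders.

use serde_json::{json, Value};
use std::collections::BTreeSet;

// Simplified sketch, not the implementation above: promote any ols:directAncestor
// that appears in the configured type-superclass set into grebi:type.
fn promote_types(node: &mut Value, type_superclasses: &BTreeSet<String>) {
    let mut types: BTreeSet<String> = node["grebi:type"]
        .as_array()
        .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect())
        .unwrap_or_default();

    if let Some(ancestors) = node["ols:directAncestor"].as_array() {
        for a in ancestors.iter().filter_map(|v| v.as_str()) {
            if type_superclasses.contains(a) {
                types.insert(a.to_string());
            }
        }
    }

    // BTreeSet gives the same sorted, deduplicated ordering the real program's
    // BTreeSet<&[u8]> produces when it rewrites the grebi:type array.
    node["grebi:type"] = json!(types.into_iter().collect::<Vec<_>>());
}

fn main() {
    let superclasses: BTreeSet<String> =
        ["mondo:0000001"].iter().map(|s| s.to_string()).collect();
    let mut node = json!({
        "grebi:type": ["ols:Class"],
        "ols:directAncestor": ["mondo:0000001", "other:ancestor"]
    });
    promote_types(&mut node, &superclasses);
    println!("{}", node); // grebi:type is now ["mondo:0000001","ols:Class"]
}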
1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,6 +15,7 @@ members = [
"02_assign_ids/grebi_extract_identifiers",
"02_assign_ids/grebi_identifiers2groups",
"02_assign_ids/grebi_assign_ids",
"02_assign_ids/grebi_superclasses2types",
"03_merge/grebi_merge",
"04_index/grebi_index",
"05_materialise/grebi_materialise",
6 changes: 1 addition & 5 deletions configs/datasource_configs/ols.json
@@ -6,11 +6,7 @@
"ingest_files": ["/nfs/production/parkinso/spot/grebi/ontologies.json.gz"],
"ingest_script": "./target/release/grebi_ingest_ols",
"ingest_args": [
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/MONDO_0000001" },
{ "name": "--superclass-is-type", "value": "http://www.ebi.ac.uk/efo/EFO_0000408" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_36080" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_24431" }
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" }
]
}
]
8 changes: 7 additions & 1 deletion configs/subgraph_configs/ebi_full_monarch.json
@@ -19,11 +19,17 @@
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
["grebi:description", "iao:definition", "monarch:description", "ols:definition"],
["grebi:synonym", "monarch:synonym", "iao:alternative_label", "ols:synonym", "oboinowl:hasExactSynonym"],
["mondo:0000001", "ogms:0000031"]
["biolink:Disease", "mondo:0000001", "ogms:0000031"]
],
"exclude_props": [
"ols:curie",
9 changes: 8 additions & 1 deletion configs/subgraph_configs/hett.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
9 changes: 8 additions & 1 deletion configs/subgraph_configs/hra_kg.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
9 changes: 8 additions & 1 deletion configs/subgraph_configs/monarch.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],