extract superclass->type to a separate program
jamesamcl committed Aug 13, 2024
1 parent f3c8205 commit 0b3b250
Showing 17 changed files with 289 additions and 59 deletions.
26 changes: 7 additions & 19 deletions 01_ingest/grebi_ingest_ols/src/main.rs
@@ -24,10 +24,8 @@ struct Args {
ontologies:String,

#[arg(long)]
defining_only:bool,
defining_only:bool

#[arg(long)]
superclass_is_type:Vec<String>,
}

fn main() {
@@ -49,7 +47,6 @@ fn main() {
ontology_whitelist.insert(ontology.to_string());
}

let mut type_superclasses:HashSet<String> = args.superclass_is_type.iter().map(|x| x.to_string()).collect();

let mut json = JsonStreamReader::new(reader);

@@ -60,14 +57,14 @@
}
json.begin_array().unwrap();
while json.has_next().unwrap() {
read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only, &type_superclasses);
read_ontology(&mut json, &mut output_nodes, &datasource_name, &ontology_whitelist, args.defining_only);
}
json.end_array().unwrap();
json.end_object().unwrap();

}

fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool, type_superclasses:&HashSet<String>) {
fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource_name: &str, ontology_whitelist:&HashSet<String>, defining_only:bool) {

json.begin_object().unwrap();

@@ -131,11 +128,11 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

loop {
if key.eq("classes") {
read_entities(json, output_nodes, &datasource, "ols:Class", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Class", defining_only);
} else if key.eq("properties") {
read_entities(json, output_nodes, &datasource, "ols:Property", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Property", defining_only);
} else if key.eq("individuals") {
read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only, &type_superclasses);
read_entities(json, output_nodes, &datasource, "ols:Individual", defining_only);
} else {
panic!();
}
@@ -150,7 +147,7 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

}

fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool, type_superclasses:&HashSet<String>) {
fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool) {
json.begin_array().unwrap();
while json.has_next().unwrap() {
let mut val:Value = read_value(json);
@@ -214,15 +211,6 @@ fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n
output_nodes.write_all(r#"","grebi:type":[""#.as_bytes()).unwrap();
output_nodes.write_all(grebitype.as_bytes()).unwrap();
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
if obj.contains_key("ols:directAncestor") {
for ancestor in get_string_values(obj.get("ols:directAncestor").unwrap()) {
if type_superclasses.contains(ancestor) {
output_nodes.write_all(r#",""#.as_bytes()).unwrap();
write_escaped_string(&ancestor.as_bytes(), output_nodes);
output_nodes.write_all(r#"""#.as_bytes()).unwrap();
}
}
}
output_nodes.write_all(r#"]"#.as_bytes()).unwrap();

for k in obj.keys() {
31 changes: 2 additions & 29 deletions 02_assign_ids/grebi_assign_ids/src/main.rs
@@ -11,6 +11,7 @@ use grebi_shared::json_parser::JsonParser;
use clap::Parser;

use grebi_shared::find_strings;
use grebi_shared::load_groups_txt::load_groups_txt;


#[derive(clap::Parser, Debug)]
@@ -44,35 +45,7 @@ fn main() {

let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {

let start_time = std::time::Instant::now();
let mut reader = BufReader::new(File::open( args.groups_txt ).unwrap() );
let mut mapping:HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

loop {
let mut line: Vec<u8> = Vec::new();
reader.read_until(b'\n', &mut line).unwrap();

if line.len() == 0 {
break;
}
if line[line.len() - 1] == b'\n' {
line.pop();
}

let tokens:Vec<&[u8]> = line.split(|&x| x == b'\t').collect();

for i in 1..tokens.len() {
mapping.insert(tokens[i].to_vec(), tokens[0].to_vec());
}
}

eprintln!("loaded {} id->group mappings in {} seconds", mapping.len(), start_time.elapsed().as_secs());

mapping
};

let id_to_group:HashMap<Vec<u8>, Vec<u8>> = load_groups_txt(&args.groups_txt);

let start_time = std::time::Instant::now();

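The load_groups_txt helper referenced here lives in grebi_shared and is not itself part of this diff. Judging from the inlined code it replaces above, it presumably looks roughly like the sketch below; the exact signature and error handling in grebi_shared may differ.

use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};

// Sketch of grebi_shared::load_groups_txt, reconstructed from the inlined code
// removed above (the real helper may differ in detail). Each line of groups.txt
// is tab-separated: the first token is the group id, and every following token
// is an identifier assigned to that group.
//
// Matches both call sites in this commit:
//   load_groups_txt(&args.groups_txt)
//   load_groups_txt(args.groups_txt.as_str())
pub fn load_groups_txt(path: &str) -> HashMap<Vec<u8>, Vec<u8>> {
    let start_time = std::time::Instant::now();
    let mut reader = BufReader::new(File::open(path).unwrap());
    let mut mapping: HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

    loop {
        let mut line: Vec<u8> = Vec::new();
        reader.read_until(b'\n', &mut line).unwrap();
        if line.is_empty() {
            break;
        }
        if line[line.len() - 1] == b'\n' {
            line.pop();
        }
        let tokens: Vec<&[u8]> = line.split(|&x| x == b'\t').collect();
        for i in 1..tokens.len() {
            mapping.insert(tokens[i].to_vec(), tokens[0].to_vec());
        }
    }

    eprintln!("loaded {} id->group mappings in {} seconds",
        mapping.len(), start_time.elapsed().as_secs());
    mapping
}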
4 changes: 4 additions & 0 deletions 02_assign_ids/grebi_identifiers2groups/src/main.rs
@@ -163,13 +163,17 @@ fn main() {
// - CURIEs
// - textual (readable) IDs rather than numeric
// - "grebi:" IDs always win (used to consolidate names on grebi:name etc.)
// - "biolink:" IDs are a second best
// lower score is better
//
fn id_score(id:&[u8]) -> i32 {

if id.starts_with(b"grebi:") {
return i32::MIN;
}
if id.starts_with(b"biolink:") {
return i32::MIN+1000;
}

let mut score = 0;

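Only the top of id_score is visible in this hunk; the CURIE and readability heuristics that follow, and the call site that consumes the scores, are elsewhere in the file. A hedged sketch of how the scores would presumably be applied when choosing the preferred identifier for a group (pick_preferred_id is a hypothetical helper, not code from this repository):

// Prefix rules as shown in the hunk above; the CURIE/readability heuristics
// that follow in the real function are elided and replaced by a placeholder.
fn id_score(id: &[u8]) -> i32 {
    if id.starts_with(b"grebi:") {
        return i32::MIN;
    }
    if id.starts_with(b"biolink:") {
        return i32::MIN + 1000;
    }
    0 // placeholder for the remaining heuristics
}

// Hypothetical helper: lower score wins, so a "grebi:" id beats a "biolink:" id,
// which in turn beats ordinary identifiers.
fn pick_preferred_id<'a>(ids: &[&'a [u8]]) -> Option<&'a [u8]> {
    ids.iter().copied().min_by_key(|id| id_score(id))
}

fn main() {
    let group: Vec<&[u8]> = vec![
        b"ogms:0000031".as_slice(),
        b"mondo:0000001".as_slice(),
        b"biolink:Disease".as_slice(),
    ];
    assert_eq!(pick_preferred_id(&group), Some(b"biolink:Disease".as_slice()));
}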
14 changes: 14 additions & 0 deletions 02_assign_ids/grebi_superclasses2types/Cargo.toml
@@ -0,0 +1,14 @@
[package]
name = "grebi_superclasses2types"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = { version = "1.0.108", features=["preserve_order"] }
grebi_shared = { path = "../../grebi_shared" }
csv = "1.3.0"
fasthash = "0.4.0"
lmdb-zero = "0.4.4"
bloomfilter = "1.0.13"
jemallocator = "0.5.4"
clap = { version = "4.4.11", features = ["derive"] }
154 changes: 154 additions & 0 deletions 02_assign_ids/grebi_superclasses2types/src/main.rs
@@ -0,0 +1,154 @@




use std::collections::{BTreeSet, HashMap, HashSet};
use std::fs::File;
use std::{env, io};
use std::io::{BufRead, BufReader };
use std::io::{Write, BufWriter};
use grebi_shared::json_lexer::{lex, JsonTokenType};
use grebi_shared::json_parser::JsonParser;
use clap::Parser;

use grebi_shared::find_strings;
use grebi_shared::load_groups_txt::load_groups_txt;


#[derive(clap::Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {

#[arg(long)]
groups_txt:String,

#[arg(long)]
type_superclasses:String,

}

#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn main() {

let args = Args::parse();

let mut type_superclasses:HashSet<Vec<u8>> = {
let id_to_group:HashMap<Vec<u8>, Vec<u8>> = load_groups_txt(args.groups_txt.as_str());
let mut res = HashSet::new();
for prop in args.type_superclasses.split(",") {
let mapped = id_to_group.get(prop.as_bytes());
if mapped.is_some() {
res.insert(mapped.unwrap().to_vec());
} else {
res.insert(prop.as_bytes().to_vec());
}
}
res
};

let start_time = std::time::Instant::now();

let stdin = io::stdin();
let handle = stdin.lock();
let mut reader = BufReader::new(handle);

let stdout = io::stdout().lock();
let mut writer = BufWriter::new(stdout);

loop {
let mut line: Vec<u8> = Vec::new();
reader.read_until(b'\n', &mut line).unwrap();

if line.len() == 0 {
break;
}

let mut json = JsonParser::parse(&line);

let mut id:Option<&[u8]> = None;
let mut types:BTreeSet<&[u8]> = BTreeSet::new();

json.begin_object();
json.mark();
while json.peek().kind != JsonTokenType::EndObject {
let prop_key = json.name();

if prop_key.eq(b"grebi:type") {
if json.peek().kind == JsonTokenType::StartArray {
json.begin_array();
while json.peek().kind != JsonTokenType::EndArray {
types.insert(json.string());
}
json.end_array();
} else {
types.insert(json.string());
}
} else if prop_key.eq(b"ols:directAncestor") {
if json.peek().kind == JsonTokenType::StartArray {
json.begin_array();
while json.peek().kind != JsonTokenType::EndArray {
let ancestor = json.string();
if type_superclasses.contains(ancestor) {
types.insert(ancestor);
}
}
json.end_array();
} else {
let ancestor = json.string();
if type_superclasses.contains(ancestor) {
types.insert(ancestor);
}
}
} else {
json.value(); // skip
}
}

json.rewind();

writer.write_all(b"{").unwrap();

let mut is_first = true;

while json.peek().kind != JsonTokenType::EndObject {
if is_first {
is_first = false;
} else {
writer.write_all(b",").unwrap();
}

let name = json.name();

writer.write_all(b"\"").unwrap();
writer.write_all(name).unwrap();
writer.write_all(b"\":").unwrap();

if name.eq(b"grebi:type") {
json.value(); // skip, we already have the types

writer.write_all(b"[").unwrap();
let mut is_first_type = true;
for t in types.iter() {
if is_first_type {
is_first_type = false;
} else {
writer.write_all(b",").unwrap();
}
writer.write_all(b"\"").unwrap();
writer.write_all(t).unwrap();
writer.write_all(b"\"").unwrap();
}
writer.write_all(b"]").unwrap();
} else {
writer.write_all(json.value()).unwrap();
}
}

writer.write_all(b"}\n").unwrap();
}

eprintln!("completed superclass2types in {}", start_time.elapsed().as_secs());

}
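A minimal sketch of the per-line rewrite this program performs, written against serde_json (one of the crate's declared dependencies) rather than the streaming JsonParser used above; mondo:0000001 is taken from the type_superclasses entries in the subgraph configs below, while the other identifiers are placeholders.

use serde_json::{json, Value};
use std::collections::BTreeSet;

// Simplified sketch, not the implementation above: promote any ols:directAncestor
// that appears in the configured type-superclass set into grebi:type.
fn promote_types(node: &mut Value, type_superclasses: &BTreeSet<String>) {
    let mut types: BTreeSet<String> = node["grebi:type"]
        .as_array()
        .map(|arr| arr.iter().filter_map(|v| v.as_str().map(String::from)).collect())
        .unwrap_or_default();

    if let Some(ancestors) = node["ols:directAncestor"].as_array() {
        for a in ancestors.iter().filter_map(|v| v.as_str()) {
            if type_superclasses.contains(a) {
                types.insert(a.to_string());
            }
        }
    }

    // BTreeSet gives the same sorted, deduplicated ordering the real program's
    // BTreeSet<&[u8]> produces when it rewrites the grebi:type array.
    node["grebi:type"] = json!(types.into_iter().collect::<Vec<_>>());
}

fn main() {
    let superclasses: BTreeSet<String> =
        ["mondo:0000001"].iter().map(|s| s.to_string()).collect();
    let mut node = json!({
        "grebi:type": ["ols:Class"],
        "ols:directAncestor": ["mondo:0000001", "other:ancestor"]
    });
    promote_types(&mut node, &superclasses);
    println!("{}", node); // grebi:type is now ["mondo:0000001","ols:Class"]
}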
1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,6 +15,7 @@ members = [
"02_assign_ids/grebi_extract_identifiers",
"02_assign_ids/grebi_identifiers2groups",
"02_assign_ids/grebi_assign_ids",
"02_assign_ids/grebi_superclasses2types",
"03_merge/grebi_merge",
"04_index/grebi_index",
"05_materialise/grebi_materialise",
6 changes: 1 addition & 5 deletions configs/datasource_configs/ols.json
@@ -6,11 +6,7 @@
"ingest_files": ["/nfs/production/parkinso/spot/grebi/ontologies.json.gz"],
"ingest_script": "./target/release/grebi_ingest_ols",
"ingest_args": [
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/MONDO_0000001" },
{ "name": "--superclass-is-type", "value": "http://www.ebi.ac.uk/efo/EFO_0000408" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_36080" },
{ "name": "--superclass-is-type", "value": "http://purl.obolibrary.org/obo/CHEBI_24431" }
{ "name": "--ontologies", "value": "efo,mp,hp,go,ro,iao,uberon,pato,oba,chebi,bspo,iao,obi,bfo,cob,cl,so,eco,pr,ncbitaxon,oio,iao,biolink" }
]
}
]
8 changes: 7 additions & 1 deletion configs/subgraph_configs/ebi_full_monarch.json
@@ -19,11 +19,17 @@
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
["grebi:description", "iao:definition", "monarch:description", "ols:definition"],
["grebi:synonym", "monarch:synonym", "iao:alternative_label", "ols:synonym", "oboinowl:hasExactSynonym"],
["mondo:0000001", "ogms:0000031"]
["biolink:Disease", "mondo:0000001", "ogms:0000031"]
],
"exclude_props": [
"ols:curie",
9 changes: 8 additions & 1 deletion configs/subgraph_configs/hett.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
9 changes: 8 additions & 1 deletion configs/subgraph_configs/hra_kg.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],
9 changes: 8 additions & 1 deletion configs/subgraph_configs/monarch.json
@@ -16,7 +16,14 @@
"obo:chebi/smiles",
"impc:pmId",
"impc:humanGeneAccId",
"monarch:iri"
"monarch:iri",
"skos:exactMatch"
],
"type_superclasses": [
"mondo:0000001",
"efo:0000408",
"chebi:36080",
"chebi:24431"
],
"additional_equivalence_groups": [
["grebi:name", "ols:label", "rdfs:label", "monarch:name", "impc:name", "reactome:displayName"],