Skip to content

Commit

Permalink
Assign ids to everything not just equivalence groups (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesamcl authored Jul 29, 2024
1 parent 484d549 commit 38a9c8c
Show file tree
Hide file tree
Showing 24 changed files with 263 additions and 306 deletions.
1 change: 0 additions & 1 deletion 01_ingest/grebi_ingest_gwas/src/write_associations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use std::io::{BufWriter, self, BufReader, StdinLock, StdoutLock, Write};
use std::ptr::eq;
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;
use grebi_shared::serialize_equivalence;
use serde_json::{json, Value};

use crate::check_headers::check_headers;
Expand Down
3 changes: 1 addition & 2 deletions 01_ingest/grebi_ingest_gwas/src/write_studies.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ use std::io::{BufWriter, self, BufReader, StdinLock, StdoutLock, Write};
use std::ptr::eq;
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;
use grebi_shared::serialize_equivalence;
use serde_json::json;

use crate::check_headers::check_headers;
Expand Down Expand Up @@ -106,4 +105,4 @@ pub fn write_studies(csv_reader: &mut csv::Reader<BufReader<StdinLock>>,nodes_wr
}
}

}
}
11 changes: 0 additions & 11 deletions 01_ingest/grebi_ingest_ols/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use std::ptr::eq;
use clap::Parser;
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;
use grebi_shared::serialize_equivalence;
use struson::reader::{JsonReader, JsonStreamReader, ValueType};
use serde_json::Value;
use serde_json::Map;
Expand Down Expand Up @@ -146,16 +145,6 @@ fn read_ontology(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_n

}

/// Predicate CURIEs listed as equivalence predicates (the code that consumes
/// this table is outside the visible hunk — confirm usage before relying on it).
///
/// Only the two OWL equivalence predicates are enabled. The following
/// candidates are recorded here but currently switched off:
/// `owl:sameAs`, `skos:exactMatch`, `oboinowl:hasAlternativeId`,
/// `uniprot:replaces`, and `iao:0100001` (replacement term).
const EQUIV_PREDICATES: [&str; 2] = ["owl:equivalentClass", "owl:equivalentProperty"];

fn read_entities(json: &mut JsonStreamReader<BufReader<StdinLock<'_>>>, output_nodes: &mut BufWriter<StdoutLock>, datasource:&String, grebitype:&str, defining_only:bool) {
json.begin_array().unwrap();
while json.has_next().unwrap() {
Expand Down
1 change: 0 additions & 1 deletion 01_ingest/grebi_ingest_reactome/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ use std::env;
use clap::Parser;
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;
use grebi_shared::serialize_equivalence;
use serde_json::json;
use serde_json::Value;

Expand Down
1 change: 0 additions & 1 deletion 01_ingest/grebi_ingest_sssom/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use std::ptr::eq;
use clap::Parser;
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;
use grebi_shared::serialize_equivalence;
use serde_json::json;
use serde_yaml;

Expand Down
2 changes: 1 addition & 1 deletion 01_ingest/grebi_normalise_prefixes/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{env, io};
use std::io::{BufRead, BufReader };
use std::io::{Write, BufWriter};

use grebi_shared::{get_subject, find_strings, serialize_equivalence, json_parser, json_lexer};
use grebi_shared::{get_subject, find_strings, json_parser, json_lexer};
use grebi_shared::prefix_map::PrefixMap;
use grebi_shared::prefix_map::PrefixMapBuilder;

Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ use grebi_shared::find_strings;
struct Args {

#[arg(long)]
add_prefix: String, // used to prepend the subgraph name like hra_kg:g:
identifier_properties:String,

#[arg(long)]
groups_txt: String,
Expand All @@ -34,9 +34,15 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
fn main() {

let args = Args::parse();
let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

let add_prefix = args.add_prefix;

let mut id_props:HashSet<Vec<u8>> = HashSet::new();
for prop in args.identifier_properties.split(",") {
id_props.insert(prop.as_bytes().to_vec());
}


let preserve_fields:HashSet<Vec<u8>> = args.preserve_field.iter().map(|x| x.as_bytes().to_vec()).collect();

let id_to_group:HashMap<Vec<u8>, Vec<u8>> = {

Expand Down Expand Up @@ -95,29 +101,32 @@ fn main() {
while json.peek().kind != JsonTokenType::EndObject {
let prop_key = json.name();

if prop_key == b"id" {
id = Some(json.string());
// any of the IDs will do, we only need one
// as all identifiers map to the same group
//
if id_props.contains(prop_key) {
// TODO handle the same cases as the id extraction does
if json.peek().kind == JsonTokenType::StartArray {
json.begin_array();
id = Some(json.string());
} else {
id = Some(json.string());
}
break;
} else {
json.value(); // skip
}
}

let group = id_to_group.get(id.unwrap());
if group.is_some() {

// the subject mapped to an equivalence group
writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
writer.write_all(add_prefix.as_bytes()).unwrap();
writer.write_all(group.unwrap().as_slice()).unwrap();
writer.write_all("\"".as_bytes()).unwrap();
} else {
// the subject did not map to an equivalence group
writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
writer.write_all(add_prefix.as_bytes()).unwrap();
writer.write_all(id.unwrap()).unwrap();
writer.write_all("\"".as_bytes()).unwrap();
if !group.is_some() {
panic!("could not find identifier group for id: {}", String::from_utf8(id.unwrap().to_vec()).unwrap());
}

writer.write_all("{\"grebi:nodeId\":\"".as_bytes()).unwrap();
writer.write_all(group.unwrap().as_slice()).unwrap();
writer.write_all("\"".as_bytes()).unwrap();

json.rewind();
while json.peek().kind != JsonTokenType::EndObject {

Expand All @@ -129,7 +138,6 @@ fn main() {
} else {
let name_group = id_to_group.get(name);
if name_group.is_some() {
writer.write_all(add_prefix.as_bytes()).unwrap();
writer.write_all(name_group.unwrap()).unwrap();
} else {
writer.write_all(name).unwrap();
Expand All @@ -140,7 +148,7 @@ fn main() {
if name.eq(b"id") || preserve_fields.contains(name) {
writer.write_all(json.value()).unwrap();
} else {
write_value(&mut writer, json.value(), &id_to_group, &add_prefix);
write_value(&mut writer, json.value(), &id_to_group);
}
}

Expand All @@ -151,7 +159,7 @@ fn main() {

}

fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>, add_prefix:&str) {
fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&HashMap<Vec<u8>, Vec<u8>>) {

let string_locations = find_strings(&value);

Expand All @@ -174,7 +182,6 @@ fn write_value(writer:&mut BufWriter<io::StdoutLock>, value:&[u8], id_to_group:&

let pv_group = id_to_group.get(str);
if pv_group.is_some() {
writer.write_all(add_prefix.as_bytes()).unwrap();
writer.write_all(pv_group.unwrap()).unwrap();
} else {
writer.write_all(str).unwrap();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "grebi_extract_equivalences"
name = "grebi_extract_identifiers"
version = "0.1.0"
edition = "2021"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::{env, io};
use std::io::{BufRead, BufReader };
use std::io::{Write, BufWriter};

use grebi_shared::{get_subject, find_strings, serialize_equivalence, json_parser, json_lexer};
use grebi_shared::{get_subject, find_strings, json_parser, json_lexer};

use clap::Parser;

Expand All @@ -22,7 +22,7 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
struct Args {

#[arg(long)]
equivalence_properties:String
identifier_properties:String
}

fn main() {
Expand All @@ -36,13 +36,13 @@ fn main() {
let stdout = io::stdout().lock();
let mut writer = BufWriter::new(stdout);

let mut equiv_props:HashSet<Vec<u8>> = HashSet::new();
let mut id_props:HashSet<Vec<u8>> = HashSet::new();

let mut n_total = 0;

let args = Args::parse();
for prop in args.equivalence_properties.split(",") {
equiv_props.insert(prop.as_bytes().to_vec());
for prop in args.identifier_properties.split(",") {
id_props.insert(prop.as_bytes().to_vec());
}

loop {
Expand All @@ -54,32 +54,15 @@ fn main() {
}

let mut json = JsonParser::parse(&line);


let mut id:Option<&[u8]> = None;
json.begin_object();
json.mark();

while json.peek().kind != JsonTokenType::EndObject {
let name = json.name();
if name.eq("id".as_bytes()) {
id = Some(json.string());
break;
} else {
json.value(); // skip
}
}
json.rewind();

if id.is_none() {
panic!("Missing id field in JSON: {}", String::from_utf8(line).unwrap());
}
let mut wrote_any = false;

while json.peek().kind != JsonTokenType::EndObject {

let k = json.name();

if !equiv_props.contains(k) {
if !id_props.contains(k) {
json.value(); // skip
continue;
}
Expand All @@ -88,24 +71,33 @@ fn main() {
json.begin_array();
while json.peek().kind != JsonTokenType::EndArray {
if json.peek().kind == JsonTokenType::StartString {
let serialized = serialize_equivalence(id.unwrap(), json.string());
if serialized.is_some() {
writer.write_all(&serialized.unwrap()).unwrap();
if wrote_any {
writer.write_all(b"\t").unwrap();
} else {
wrote_any = true;
}
writer.write_all(&json.string()).unwrap();
} else {
json.value(); // skip
}
}
json.end_array();
} else if json.peek().kind == JsonTokenType::StartString {
let serialized = serialize_equivalence(id.unwrap(), json.string());
if serialized.is_some() {
writer.write_all(&serialized.unwrap()).unwrap();
if wrote_any {
writer.write_all(b"\t").unwrap();
} else {
wrote_any = true;
}
writer.write_all(&json.string()).unwrap();
} else {
json.value(); // skip
}
}
if !wrote_any {
panic!("no identifiers found in object {}", String::from_utf8_lossy(&line));
}

writer.write_all(b"\n").unwrap();

n_total = n_total + 1;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[package]
name = "grebi_equivalences2groups"
name = "grebi_identifiers2groups"
version = "0.1.0"
edition = "2021"

Expand Down
Loading

0 comments on commit 38a9c8c

Please sign in to comment.