Skip to content

Commit

Permalink
faster identifiers2groups
Browse files: browse the repository at this point in the history
  • Loading branch information
jamesamcl committed Aug 1, 2024
1 parent 0fbd15c commit 6f14406
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 15 deletions.
3 changes: 3 additions & 0 deletions 02_assign_ids/grebi_identifiers2groups/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@ csv = "1.3.0"
fasthash = "0.4.0"
lmdb-zero = "0.4.4"
bloomfilter = "1.0.13"
jemallocator = "0.5.4"
clap = { version = "4.4.11", features = ["derive"] }
hashbrown = "0.14.5"
fxhash = "0.2.1"

42 changes: 27 additions & 15 deletions 02_assign_ids/grebi_identifiers2groups/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@

use std::collections::{BTreeSet, BTreeMap};
use std::{env, io};
use hashbrown::HashMap;
use hashbrown::HashSet;
use csv;
use bloomfilter::Bloom;
use clap::Parser;
use std::io::{BufRead, BufReader };
use std::io::{Write, BufWriter};

#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

#[derive(clap::Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
Expand All @@ -15,21 +19,25 @@ struct Args {
add_group: Vec<String>,

#[arg(long)]
add_prefix: String // used to prepend the subgraph name like hra_kg:g:
add_prefix: String, // used to prepend the subgraph name like hra_kg:g:

#[arg(long)]
prealloc_size: usize
}


fn main() {

let mut group_to_entities:BTreeMap<u64, BTreeSet<Vec<u8>>> = BTreeMap::new();
let mut entity_to_group:BTreeMap<Vec<u8>, u64> = BTreeMap::new();
let args = Args::parse();

let mut group_to_entities:HashMap<u64, HashSet<Vec<u8>>> = HashMap::with_capacity(args.prealloc_size);
let mut entity_to_group:HashMap<Vec<u8>, u64> = HashMap::with_capacity(args.prealloc_size);

let mut next_group_id:u64 = 1;

let args = Args::parse();
let add_group:Vec<String> = args.add_group;
for group in add_group {
let entries:BTreeSet<Vec<u8>> = group.split(",").map(|s| s.as_bytes().to_vec()).collect();
let entries:HashSet<Vec<u8>> = group.split(",").map(|s| s.as_bytes().to_vec()).collect();
let gid = next_group_id;
next_group_id = next_group_id + 1;
for id in &entries {
Expand All @@ -51,10 +59,13 @@ fn main() {
loop {
let mut line: Vec<u8> = Vec::new();
reader.read_until(b'\n', &mut line).unwrap();
//if line.len() > 1000 {
//eprintln!("warn: super long line: {}", String::from_utf8_lossy(&line));
//}

n = n + 1;
if n % 1000000 == 0 {
eprintln!("...{} lines in {} seconds", n, start_time.elapsed().as_secs());
eprintln!("...{} lines in {} seconds [{} groups, {} entities, next group id {}]", n, start_time.elapsed().as_secs(), group_to_entities.len(), entity_to_group.len(), next_group_id);
}


Expand All @@ -65,11 +76,12 @@ fn main() {
line.pop();
}

let mut ids:Vec<Vec<u8>> = line.split(|&byte| byte == b'\t').map(|id| id.to_vec()).collect();
//let mut ids:Vec<Vec<u8>> = line.split(|&byte| byte == b'\t').map(|id| id.to_vec()).collect();
//let mut ids:Vec<&[u8]> = line.split(|&byte| byte == b'\t').collect();

let mut target_group:u64 = 0;
for id in &ids {
let g = entity_to_group.get(id);
for id in line.split(|&byte| byte == b'\t') {
let g = entity_to_group.get::<[u8]>(&id);
if g.is_some() {
target_group = *g.unwrap();
break;
Expand All @@ -79,11 +91,11 @@ fn main() {
if target_group != 0 {
// at least one of the ids already had a group;
// put everything else into it
for id in &ids {
let g2 = entity_to_group.get(id);
for id in line.split(|&byte| byte == b'\t') {
let g2 = entity_to_group.get::<[u8]>(id);
if g2.is_some() && *g2.unwrap() != target_group {
// this id already had a group different to ours
let entities_in_b = group_to_entities.remove(&g2.unwrap()).unwrap();
let entities_in_b = group_to_entities.remove(g2.unwrap()).unwrap();
for e in entities_in_b.clone() {
entity_to_group.insert(e, target_group);
}
Expand All @@ -100,10 +112,10 @@ fn main() {
// none of the ids had a group so we make a new one
target_group = next_group_id;
next_group_id = next_group_id + 1;
for id in &ids {
for id in line.split(|&byte| byte == b'\t') {
entity_to_group.insert(id.to_vec(), target_group);
}
group_to_entities.insert(target_group, ids.iter().map(|id| id.to_vec()).collect::<BTreeSet<_>>());
group_to_entities.insert(target_group, line.split(|&byte| byte == b'\t').map(|id| id.to_vec()).collect::<HashSet<_>>());
}
}

Expand Down

0 comments on commit 6f14406

Please sign in to comment.