Skip to content

Commit

Permalink
MRG: add Manifest::intersect_manifest to Rust core (#3305)
Browse files Browse the repository at this point in the history
This PR implements `Manifest::intersect_manifest` and
`Collection::intersect_manifest` in the Rust layer; both are needed to
support standalone manifests in
sourmash-bio/sourmash_plugin_branchwater#430.

As part of this, the PR implements `Eq` and `Hash` traits for `Record`
so that `HashSet` can be used for efficient intersections.

Related PRs:
* sourmash-bio/sourmash_plugin_branchwater#430
  • Loading branch information
ctb authored Sep 21, 2024
1 parent 26b50f3 commit ada039a
Show file tree
Hide file tree
Showing 2 changed files with 135 additions and 1 deletion.
31 changes: 31 additions & 0 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,10 @@ impl Collection {
assert_eq!(sig.signatures.len(), 1);
Ok(sig)
}

/// Narrow this collection's manifest, in place, to the records that are
/// also present in `mf`.
///
/// The filtering itself is delegated to `Manifest::intersect_manifest`;
/// the result replaces the manifest currently held by the collection.
pub fn intersect_manifest(&mut self, mf: &Manifest) {
    let narrowed = self.manifest.intersect_manifest(mf);
    self.manifest = narrowed;
}
}

impl Select for Collection {
Expand All @@ -233,6 +237,7 @@ mod test {
use super::Collection;

use crate::encodings::HashFunctions;
use crate::manifest::Manifest;
use crate::prelude::Select;
use crate::selection::Selection;
use crate::signature::Signature;
Expand Down Expand Up @@ -358,6 +363,32 @@ mod test {
assert_eq!(cl.len(), 0);
}

#[test]
fn collection_intersect_manifest() {
    // fixture holding four sigs (num=500)
    let sig_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../../tests/test-data/genome-s11.fa.gz.sig");
    let reader = BufReader::new(File::open(sig_path).unwrap());
    let sigs: Vec<Signature> = serde_json::from_reader(reader).expect("Loading error");
    assert_eq!(sigs.len(), 4);

    // build a collection from the loaded sigs; nothing should be dropped
    let mut cl = Collection::from_sigs(sigs).unwrap();
    assert_eq!(cl.len(), 4);

    // build a one-record manifest from the collection's first record
    let first_record = cl.manifest().iter().next().unwrap().clone();
    let single = Manifest::from(vec![first_record]);

    // after intersecting, only that one record should be left
    cl.intersect_manifest(&single);
    assert_eq!(cl.len(), 1);
}

#[test]
fn sigstore_sig_from_record() {
// load test sigs
Expand Down
105 changes: 104 additions & 1 deletion src/core/src/manifest.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use std::collections::HashSet;
use std::fs::File;
use std::hash::{Hash, Hasher};
use std::io::{BufRead, BufReader, Read, Write};
use std::ops::Deref;

Expand All @@ -17,7 +19,7 @@ use crate::Result;

/// Individual manifest record, containing information about sketches.

#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize, Clone, CopyGetters, Getters, Setters)]
pub struct Record {
#[getset(get = "pub", set = "pub")]
internal_location: PathBuf,
Expand Down Expand Up @@ -176,6 +178,37 @@ impl Record {
}
}

impl PartialEq for Record {
    /// Compare two records on their sketch-describing fields.
    ///
    /// `internal_location` is deliberately left out of the comparison, so
    /// two records describing the same sketch are equal even when they
    /// point at different locations.
    fn eq(&self, other: &Self) -> bool {
        // Tuples compare element by element, in order, stopping at the
        // first mismatch.
        let lhs = (
            &self.md5,
            &self.ksize,
            &self.moltype,
            &self.scaled,
            &self.num,
            &self.n_hashes,
            &self.with_abundance,
            &self.name,
            &self.filename,
        );
        let rhs = (
            &other.md5,
            &other.ksize,
            &other.moltype,
            &other.scaled,
            &other.num,
            &other.n_hashes,
            &other.with_abundance,
            &other.name,
            &other.filename,
        );
        lhs == rhs
    }
}

// Marker impl on top of the hand-written `PartialEq` for `Record`; together
// with `Hash` it lets `Record` references be stored in a `HashSet`.
impl Eq for Record {}

impl Hash for Record {
fn hash<H: Hasher>(&self, state: &mut H) {
self.md5.hash(state);
self.ksize.hash(state);
self.moltype.hash(state);
self.scaled.hash(state);
self.num.hash(state);
self.n_hashes.hash(state);
self.with_abundance.hash(state);
self.name.hash(state);
self.filename.hash(state);
}
}

impl Manifest {
pub fn from_reader<R: Read>(rdr: R) -> Result<Self> {
let mut records = vec![];
Expand Down Expand Up @@ -209,6 +242,20 @@ impl Manifest {
/// Iterate over the records held by this manifest, by reference.
pub fn iter(&self) -> impl Iterator<Item = &Record> {
self.records.iter()
}

pub fn intersect_manifest(&self, other: &Manifest) -> Self {
// extract tuples from other mf:
let pairs: HashSet<_> = other.iter().collect();

let records = self
.records
.iter()
.filter(|row| pairs.contains(row))
.cloned()
.collect();

Self { records }
}
}

impl Select for Manifest {
Expand Down Expand Up @@ -521,4 +568,60 @@ mod test {
let scaled100 = manifest.select(&selection).unwrap();
assert_eq!(scaled100.len(), 6);
}

#[test]
fn manifest_intersect() {
    let temp_dir = TempDir::new().unwrap();
    let utf8_output = PathBuf::from_path_buf(temp_dir.path().to_path_buf())
        .expect("Path should be valid UTF-8");
    let base_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));

    // Write a pathlist file (one signature path per line) into the temp
    // directory, then load it as a manifest.
    let load_pathlist = |list_name: &str, sigs: &[&str]| {
        let list_path = utf8_output.join(list_name);
        let mut pathfile = File::create(&list_path).unwrap();
        for &sig in sigs {
            writeln!(pathfile, "{}", base_path.join(sig)).unwrap();
        }
        Manifest::from(&list_path)
    };

    // manifest over both sketches
    let manifest = load_pathlist(
        "sig-pathlist.txt",
        &[
            "../../tests/test-data/47.fa.sig",
            "../../tests/test-data/63.fa.sig",
        ],
    );
    assert_eq!(manifest.len(), 2);

    // manifest over just one of them
    let manifest2 = load_pathlist(
        "sig-pathlist-single.txt",
        &["../../tests/test-data/63.fa.sig"],
    );
    assert_eq!(manifest2.len(), 1);

    // intersect with itself => same.
    let new_mf = manifest.intersect_manifest(&manifest);
    assert_eq!(new_mf.len(), manifest.len());

    // single-sketch manifest intersected with the full one => single.
    let new_mf = manifest2.intersect_manifest(&manifest);
    assert_eq!(new_mf.len(), 1);

    // full manifest intersected with the single-sketch one => single.
    let new_mf = manifest.intersect_manifest(&manifest2);
    assert_eq!(new_mf.len(), 1);
}
}

0 comments on commit ada039a

Please sign in to comment.