From a7b5ae0e0e12772bc1a4fd475ba19a4dba029f9e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 15 Oct 2024 11:49:19 -0700 Subject: [PATCH] MRG: update to code for forthcoming sourmash release (#467) * update to next sourmash release * cargo fmt * upd sourmash * correct numbers * upd sourmash * upd sourmash * upd sourmash * use new try_into() and eliminate several clone()s * upd sourmash * update to sourmash r0.16.0 :tada: * fix mambaforge -> miniforge --- .github/workflows/build-test.yml | 1 - Cargo.lock | 32 +++++++++++----------- Cargo.toml | 2 +- src/manysearch.rs | 1 + src/python/tests/test_fastmultigather.py | 4 +-- src/utils.rs | 35 ++++++++++++++---------- 6 files changed, 40 insertions(+), 35 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 51041d65..c9fac079 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -29,7 +29,6 @@ jobs: auto-update-conda: true python-version: 3.12 channels: conda-forge,bioconda - miniforge-variant: Mambaforge miniforge-version: latest use-mamba: true mamba-version: "*" diff --git a/Cargo.lock b/Cargo.lock index f0de5653..87c0d5e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -1548,9 +1548,9 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.15.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a73bae93170d8d0f816e18b6a630d76e134b90958850985ee2f0fb2f641d4de" +checksum = "596f20eac8896a06ca65889399ea6f408deeba375aa44c4a2efb3b46e31a02c0" dependencies = [ "az", "byteorder", @@ -1803,9 +1803,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -1814,9 +1814,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -1829,9 +1829,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1839,9 +1839,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -1852,15 +1852,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 3f786322..2c40a4de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.4", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } -sourmash = { version = "0.15.2", features = ["branchwater"] } +sourmash = { version = "0.16.0", features = ["branchwater"] } serde_json = "1.0.128" niffler = "2.4.0" log = "0.4.22" diff --git a/src/manysearch.rs b/src/manysearch.rs index d343493d..199af8b5 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -219,6 +219,7 @@ fn downsample_and_inflate_abundances( // avoid downsampling if we can if against_scaled != query_scaled { let against_ds = against + .clone() .downsample_scaled(query.scaled()) .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index f485b7f0..653b0ba5 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -959,7 +959,7 @@ def test_indexed_full_output(runtmp): # check a few columns average_ani = set(df['average_containment_ani']) avg_ani = set([round(x, 4) for x in average_ani]) - assert avg_ani == {0.8502, 0.8584, 0.8602} + assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather f_unique_weighted = set(df['f_unique_weighted']) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) @@ -967,7 +967,7 @@ def test_indexed_full_output(runtmp): unique_intersect_bp = set(df['unique_intersect_bp']) unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) - assert unique_intersect_bp == {44000, 18000, 22000} + assert unique_intersect_bp == {4400000, 1800000, 2200000} def test_nonindexed_full_vs_sourmash_gather(runtmp): diff --git a/src/utils.rs b/src/utils.rs index 33f78316..0b7df6c9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -476,25 +476,30 @@ pub fn load_sketches_above_threshold( let mut results = Vec::new(); // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { - if let Some(against_mh) = against_sig.minhash() { - // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.downsample_scaled(query.scaled()).unwrap(); - if let Ok(overlap) = against_mh_ds.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_record.name().to_string(), - md5sum: against_mh.md5sum(), - minhash: against_mh_ds.clone(), - location: against_record.internal_location().to_string(), - overlap, - }; - results.push(result); - } + let against_filename = against_sig.filename(); + let against_mh: KmerMinHash = against_sig.try_into().expect("cannot get sketch"); + let against_md5 = against_mh.md5sum(); // keep original md5sum + + let against_mh_ds = against_mh + .downsample_scaled(query.scaled()) + .expect("cannot downsample sketch"); + + // good? ok, store as candidate from prefetch. + if let Ok(overlap) = against_mh_ds.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_md5, + minhash: against_mh_ds, + location: against_record.internal_location().to_string(), + overlap, + }; + results.push(result); } } else { eprintln!( "WARNING: no compatible sketches in path '{}'", - against_sig.filename() + against_filename ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); }