From 69fd38bed0db859786a4f55c6b16f14a46f8746c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 09:57:03 -0400 Subject: [PATCH 01/11] update to next sourmash release --- Cargo.lock | 5 ++--- Cargo.toml | 3 ++- src/manysearch.rs | 1 + src/utils.rs | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f4acf157..968f5cea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1548,9 +1548,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a73bae93170d8d0f816e18b6a630d76e134b90958850985ee2f0fb2f641d4de" +version = "0.16.0" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#9b9fc5a4d40521e14390766fb6ffde4c6921062c" dependencies = [ "az", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 7935aa38..7dbc99cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,8 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.3", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } -sourmash = { version = "0.15.2", features = ["branchwater"] } +#sourmash = { version = "0.15.2", features = ["branchwater"] } +sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "refactor_rs_downsample", features = ["branchwater"] } serde_json = "1.0.128" niffler = "2.4.0" log = "0.4.22" diff --git a/src/manysearch.rs b/src/manysearch.rs index d343493d..199af8b5 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -219,6 +219,7 @@ fn downsample_and_inflate_abundances( // avoid downsampling if we can if against_scaled != query_scaled { let against_ds = against + .clone() .downsample_scaled(query.scaled()) .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; diff --git a/src/utils.rs b/src/utils.rs index 33f78316..f0a81d5f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -478,7 +478,7 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.downsample_scaled(query.scaled()).unwrap(); + let against_mh_ds = against_mh.clone().downsample_scaled(query.scaled()).unwrap(); if let Ok(overlap) = against_mh_ds.count_common(query, false) { if overlap >= threshold_hashes { let result = PrefetchResult { From ee580b683ae9f5564b5c48c8e70cda4586c8fbe9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 10:04:16 -0400 Subject: [PATCH 02/11] cargo fmt --- src/utils.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils.rs b/src/utils.rs index f0a81d5f..a702378f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -478,7 +478,10 @@ pub fn load_sketches_above_threshold( if let Ok(against_sig) = against_collection.sig_from_record(against_record) { if let Some(against_mh) = against_sig.minhash() { // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh.clone().downsample_scaled(query.scaled()).unwrap(); + let against_mh_ds = against_mh + .clone() + .downsample_scaled(query.scaled()) + .unwrap(); if let Ok(overlap) = against_mh_ds.count_common(query, false) { if overlap >= threshold_hashes { let result = PrefetchResult { From 981405166f6e4c341242357db28c1b58beddbe5e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 15:53:16 -0400 Subject: [PATCH 03/11] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 968f5cea..7e2264f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#9b9fc5a4d40521e14390766fb6ffde4c6921062c" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#79afb857967d5f48393341a77e43ea27ab3caf22" dependencies = [ "az", "byteorder", From d27b03e8e677a6b96d866ea2c89208dcc7135aa4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 11 Oct 2024 16:38:32 -0400 Subject: [PATCH 04/11] correct numbers --- Cargo.lock | 2 +- src/python/tests/test_fastmultigather.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e2264f7..7b1eb621 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#79afb857967d5f48393341a77e43ea27ab3caf22" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ddcb049e99749f1d16c414e0fdb2d06d55a38db7" dependencies = [ "az", "byteorder", diff --git a/src/python/tests/test_fastmultigather.py b/src/python/tests/test_fastmultigather.py index f485b7f0..653b0ba5 100644 --- a/src/python/tests/test_fastmultigather.py +++ b/src/python/tests/test_fastmultigather.py @@ -959,7 +959,7 @@ def test_indexed_full_output(runtmp): # check a few columns average_ani = set(df['average_containment_ani']) avg_ani = set([round(x, 4) for x in average_ani]) - assert avg_ani == {0.8502, 0.8584, 0.8602} + assert avg_ani == {0.9221, 0.9306, 0.9316} # @CTB check against py gather f_unique_weighted = set(df['f_unique_weighted']) f_unique_weighted = set([round(x, 4) for x in f_unique_weighted]) @@ -967,7 +967,7 @@ def test_indexed_full_output(runtmp): unique_intersect_bp = set(df['unique_intersect_bp']) unique_intersect_bp = set([round(x,4) for x in unique_intersect_bp]) - assert unique_intersect_bp == {44000, 18000, 22000} + assert unique_intersect_bp == {4400000, 1800000, 2200000} def test_nonindexed_full_vs_sourmash_gather(runtmp): From e35111a7d2a6a5c510a23b04f8bb091a4ca30e76 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:08:38 -0400 Subject: [PATCH 05/11] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 7b1eb621..da608df5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ddcb049e99749f1d16c414e0fdb2d06d55a38db7" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#6eb86a390c53fc243bf65dd38fab3d1712c9f579" dependencies = [ "az", "byteorder", From 4778862e9def28528a7e9cc65d8d75c6ec2dc9f0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Oct 2024 06:24:42 -0400 Subject: [PATCH 06/11] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index da608df5..f1218059 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#6eb86a390c53fc243bf65dd38fab3d1712c9f579" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#62f03eb3de8f4b05307efad74f321ced04de40f1" dependencies = [ "az", "byteorder", From a0e02efb86ad084e1fcd18585a5f71253ca105ad Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:19:58 -0400 Subject: [PATCH 07/11] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f1218059..a56788cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#62f03eb3de8f4b05307efad74f321ced04de40f1" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#e4e5555fd81a9a8677bbe065cf7f528270b01fed" dependencies = [ "az", "byteorder", From 9b448c8a873e9fde1bc4cb84f441863078677eb9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 07:29:52 -0400 Subject: [PATCH 08/11] use new try_into() and eliminate several clone()s --- src/utils.rs | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/src/utils.rs b/src/utils.rs index a702378f..0b7df6c9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -476,28 +476,30 @@ pub fn load_sketches_above_threshold( let mut results = Vec::new(); // Load against into memory if let Ok(against_sig) = against_collection.sig_from_record(against_record) { - if let Some(against_mh) = against_sig.minhash() { - // downsample against_mh, but keep original md5sum - let against_mh_ds = against_mh - .clone() - .downsample_scaled(query.scaled()) - .unwrap(); - if let Ok(overlap) = against_mh_ds.count_common(query, false) { - if overlap >= threshold_hashes { - let result = PrefetchResult { - name: against_record.name().to_string(), - md5sum: against_mh.md5sum(), - minhash: against_mh_ds.clone(), - location: against_record.internal_location().to_string(), - overlap, - }; - results.push(result); - } + let against_filename = against_sig.filename(); + let against_mh: KmerMinHash = against_sig.try_into().expect("cannot get sketch"); + let against_md5 = against_mh.md5sum(); // keep original md5sum + + let against_mh_ds = against_mh + .downsample_scaled(query.scaled()) + .expect("cannot downsample sketch"); + + // good? ok, store as candidate from prefetch. + if let Ok(overlap) = against_mh_ds.count_common(query, false) { + if overlap >= threshold_hashes { + let result = PrefetchResult { + name: against_record.name().to_string(), + md5sum: against_md5, + minhash: against_mh_ds, + location: against_record.internal_location().to_string(), + overlap, + }; + results.push(result); } } else { eprintln!( "WARNING: no compatible sketches in path '{}'", - against_sig.filename() + against_filename ); let _i = skipped_paths.fetch_add(1, atomic::Ordering::SeqCst); } From 18a363e4adb120fbe43b77ec41fc5158c414107b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Oct 2024 10:45:13 -0400 Subject: [PATCH 09/11] upd sourmash --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index a56788cf..14c16762 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1549,7 +1549,7 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#e4e5555fd81a9a8677bbe065cf7f528270b01fed" +source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ceaea393d95b3b85575b51c20784d3b9442da149" dependencies = [ "az", "byteorder", From be4c39c682f0a2a6ea3aed2a8aadf3b3072b764c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 15 Oct 2024 14:41:44 -0400 Subject: [PATCH 10/11] update to sourmash r0.16.0 :tada: --- Cargo.lock | 31 ++++++++++++++++--------------- Cargo.toml | 3 +-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 37308eaf..87c0d5e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -713,9 +713,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" +checksum = "6a88f1bda2bd75b0452a14784937d796722fdebfe50df998aeb3f0b7603019a9" dependencies = [ "wasm-bindgen", ] @@ -1549,7 +1549,8 @@ checksum = "bceb57dc07c92cdae60f5b27b3fa92ecaaa42fe36c55e22dbfb0b44893e0b1f7" [[package]] name = "sourmash" version = "0.16.0" -source = "git+https://github.com/sourmash-bio/sourmash.git?branch=refactor_rs_downsample#ceaea393d95b3b85575b51c20784d3b9442da149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "596f20eac8896a06ca65889399ea6f408deeba375aa44c4a2efb3b46e31a02c0" dependencies = [ "az", "byteorder", @@ -1802,9 +1803,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" +checksum = "128d1e363af62632b8eb57219c8fd7877144af57558fb2ef0368d0087bddeb2e" dependencies = [ "cfg-if", "once_cell", @@ -1813,9 +1814,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" +checksum = "cb6dd4d3ca0ddffd1dd1c9c04f94b868c37ff5fac97c30b97cff2d74fce3a358" dependencies = [ "bumpalo", "log", @@ -1828,9 +1829,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" +checksum = "e79384be7f8f5a9dd5d7167216f022090cf1f9ec128e6e6a482a2cb5c5422c56" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -1838,9 +1839,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" +checksum = "26c6ab57572f7a24a4985830b120de1594465e5d500f24afe89e16b4e833ef68" dependencies = [ "proc-macro2", "quote", @@ -1851,15 +1852,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.93" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" +checksum = "65fc09f10666a9f147042251e0dda9c18f166ff7de300607007e96bdebc1068d" [[package]] name = "web-sys" -version = "0.3.70" +version = "0.3.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" +checksum = "f6488b90108c040df0fe62fa815cbdee25124641df01814dd7282749234c6112" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/Cargo.toml b/Cargo.toml index 8491762f..2c40a4de 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,8 +12,7 @@ crate-type = ["cdylib"] pyo3 = { version = "0.22.4", features = ["extension-module", "anyhow"] } rayon = "1.10.0" serde = { version = "1.0.210", features = ["derive"] } -#sourmash = { version = "0.15.2", features = ["branchwater"] } -sourmash = { git = "https://github.com/sourmash-bio/sourmash.git", branch = "refactor_rs_downsample", features = ["branchwater"] } +sourmash = { version = "0.16.0", features = ["branchwater"] } serde_json = "1.0.128" niffler = "2.4.0" log = "0.4.22" From b7d1f5308ce6a8143596330ef4a6372e23334e27 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 15 Oct 2024 14:44:30 -0400 Subject: [PATCH 11/11] fix mambaforge -> miniforge --- .github/workflows/build-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 51041d65..c9fac079 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -29,7 +29,6 @@ jobs: auto-update-conda: true python-version: 3.12 channels: conda-forge,bioconda - miniforge-variant: Mambaforge miniforge-version: latest use-mamba: true mamba-version: "*"