From c395070612040351a8db07a00513dcd5f8f51bd3 Mon Sep 17 00:00:00 2001 From: Ryan Roelke Date: Mon, 30 Sep 2024 21:07:27 -0400 Subject: [PATCH] Add function to test Cells to identify groups of adjacent equal keys (#171) * Config common option methods use reference * Add Cells::identify_groups --- tiledb/api/src/array/mod.rs | 4 +- tiledb/api/src/config.rs | 7 ++- tiledb/api/src/query/strategy.rs | 76 ++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/tiledb/api/src/array/mod.rs b/tiledb/api/src/array/mod.rs index 776322de..2303a62f 100644 --- a/tiledb/api/src/array/mod.rs +++ b/tiledb/api/src/array/mod.rs @@ -1507,7 +1507,7 @@ pub mod tests { let uri = { let context = { let mut config = Config::new()?; - config.set_common_option(key_config.clone())?; + config.set_common_option(&key_config)?; Context::from_config(&config) }?; @@ -1541,7 +1541,7 @@ pub mod tests { { let context = Context::new()?; let array_config = - Config::new()?.with_common_option(key_config.clone())?; + Config::new()?.with_common_option(&key_config)?; let _ = ArrayOpener::new(&context, &uri, Mode::Read)? .config(&array_config)? 
diff --git a/tiledb/api/src/config.rs b/tiledb/api/src/config.rs index 83c84a71..60970ef6 100644 --- a/tiledb/api/src/config.rs +++ b/tiledb/api/src/config.rs @@ -132,11 +132,14 @@ impl Config { } } - pub fn set_common_option(&mut self, opt: CommonOption) -> TileDBResult<()> { + pub fn set_common_option( + &mut self, + opt: &CommonOption, + ) -> TileDBResult<()> { opt.apply(self) } - pub fn with_common_option(self, opt: CommonOption) -> TileDBResult<Self> { + pub fn with_common_option(self, opt: &CommonOption) -> TileDBResult<Self> { let mut s = self; s.set_common_option(opt)?; Ok(s) diff --git a/tiledb/api/src/query/strategy.rs b/tiledb/api/src/query/strategy.rs index 74844f00..e5e00533 100644 --- a/tiledb/api/src/query/strategy.rs +++ b/tiledb/api/src/query/strategy.rs @@ -972,6 +972,35 @@ impl Cells { sorted } + /// Returns the list of offsets beginning each group, i.e. run of contiguous values on `keys`. + /// + /// This is best used with sorted cells, but that is not required. + /// For each pair of offsets in the output, all cells in that index range are equal; + /// and the adjacent cells outside of the range are not equal. + pub fn identify_groups(&self, keys: &[String]) -> Option<Vec<usize>> { + if self.is_empty() { + return None; + } + let mut groups = vec![0]; + let mut icmp = 0; + for i in 1..self.len() { + let distinct = keys.iter().any(|k| { + let v = self.fields().get(k).unwrap(); + typed_field_data_go!( + v, + ref cells, + cells[i].bits_ne(&cells[icmp]) + ) + }); + if distinct { + groups.push(i); + icmp = i; + } + } + groups.push(self.len()); + Some(groups) + } + /// Returns the number of distinct values grouped on `keys` pub fn count_distinct(&self, keys: &[String]) -> usize { if self.len() <= 1 { @@ -1889,6 +1918,43 @@ mod tests { } } + /// Assert that the output of [Cells::identify_groups] produces + /// correct output for the given `keys`. 
+ fn do_cells_identify_groups(cells: Cells, keys: &[String]) { + let Some(actual) = cells.identify_groups(keys) else { + assert!(cells.is_empty()); + return; + }; + + for w in actual.windows(2) { + let (start, end) = (w[0], w[1]); + assert!(start < end); + } + + for w in actual.windows(2) { + let (start, end) = (w[0], w[1]); + for k in keys.iter() { + let f = cells.fields().get(k).unwrap(); + typed_field_data_go!(f, ref field_cells, { + for i in start..end { + assert!(field_cells[start].bits_eq(&field_cells[i])); + } + }) + } + if end < cells.len() { + let some_ne = keys.iter().any(|k| { + let f = cells.fields().get(k).unwrap(); + typed_field_data_go!(f, ref field_cells, { + field_cells[start].bits_ne(&field_cells[end]) + }) + }); + assert!(some_ne); + } + } + + assert_eq!(Some(cells.len()), actual.last().copied()); + } + fn do_cells_count_distinct_1d(cells: Cells) { for (key, field_cells) in cells.fields().iter() { let expect_count = @@ -2107,6 +2173,16 @@ mod tests { do_cells_slice_3d(cells, d1, d2, d3, s1, s2, s3) } + #[test] + fn cells_identify_groups((cells, keys) in any::<Cells>().prop_flat_map(|c| { + let keys = c.fields().keys().cloned().collect::<Vec<String>>(); + let nkeys = keys.len(); + (Just(c), proptest::sample::subsequence(keys, 0..=nkeys)) + })) + { + do_cells_identify_groups(cells, &keys) + } + #[test] fn cells_count_distinct_1d(cells in any::<Cells>()) { do_cells_count_distinct_1d(cells)