//! An implementation of the FSST+ extension, original at https://github.com/cwida/fsst_plus/.
//!
//! FSST+ augments FSST with prefix sharing: adjacent strings are grouped into "similarity
//! blocks" that store their longest common prefix only once.

#![allow(unused)]

/// A run of adjacent strings that all share a common prefix of `prefix_len` bytes.
#[derive(Debug)]
pub struct SimilarityBlock {
    /// Index of the first string in the block.
    start_idx: usize,
    /// Length in bytes of the prefix shared by every string in the block.
    prefix_len: usize,
}

/// Maximum shared prefix length.
pub const MAX_PREFIX: usize = 128;

/// Find the longest common prefix between adjacent encoded texts.
///
/// `lcp[i]` is the length of the longest common prefix between strings `i` and `i + 1`, so the
/// output has one fewer element than the input.
/// For example, `lcp(["abc", "abcd", "ab"]) -> [3, 2]`.
pub fn longest_common_prefix(codes: &[&[u8]]) -> Vec<usize> {
    let mut longest_common_prefix = Vec::new();

    // Calculate the LCP of each pair of consecutive strings in the input.
    for w in codes.windows(2) {
        let s1 = w[0];
        let s2 = w[1];

        // Count matching leading bytes until the first mismatch.
        let mut lcp = 0;
        for (&a, &b) in s1.iter().zip(s2.iter()) {
            if a == b {
                lcp += 1;
            } else {
                break;
            }
        }

        longest_common_prefix.push(lcp);
    }

    longest_common_prefix
}

/// Input: a vector of FSST-encoded strings.
/// Output: a vector of "similarity blocks".
///
/// We first calculate the LCP between adjacent strings, then find the optimal split points via
/// dynamic programming, creating "similarity" blocks whose members all share a maximal prefix.
///
/// The simple recursive formulation of the optimization looks something like this:
///
/// ```python
/// def cost(j):
///     # Minimum encoded size of the first j strings.
///     if j == 0:
///         return 0
///     best = inf
///     for i in range(j):
///         # The final block covers strings i..j: either unprefixed, or
///         # sharing the minimal LCP across the whole block.
///         for prefix_len in (0, min_lcp[i][j - 1]):
///             n = j - i
///             overhead = n * (1 if prefix_len == 0 else 3)
///             saved = (n - 1) * prefix_len
///             best = min(best, cost(i) + overhead + total_len(i, j) - saved)
///     return best
/// ```
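///
/// A worked example of the cost model (my reading of the constants used below: 1 length byte
/// per string, plus 2 more bytes when a string references a shared prefix): a block of 4
/// strings of 20 bytes each sharing a 10-byte prefix costs 4 * 3 + 80 - 3 * 10 = 62 bytes,
/// versus 4 * 1 + 80 = 84 bytes with no sharing.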
#[allow(clippy::needless_range_loop)]
pub fn chunk_by_similarity(strings: &[&[u8]]) -> Vec<SimilarityBlock> {
    // Calculate LCP between adjacent strings first.
    let lcp = longest_common_prefix(strings);

    // min_lcp[i][j] = length of the longest prefix shared by all of strings[i..=j],
    // i.e. min(lcp[i], ..., lcp[j - 1]), capped at MAX_PREFIX and the string's own length.
    let mut min_lcp = vec![vec![0; strings.len()]; strings.len()];

    // ... the diagonals are the capped string lengths.
    for i in 0..strings.len() {
        min_lcp[i][i] = strings[i].len().min(MAX_PREFIX);
        // NB: iterate up to strings.len(), not lcp.len(), so the final column
        // (blocks that include the last string) is populated too.
        for j in (i + 1)..strings.len() {
            min_lcp[i][j] = min_lcp[i][j - 1].min(lcp[j - 1]);
        }
    }
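
    // A small worked example (reusing the `longest_common_prefix` doc example): for
    // ["abc", "abcd", "ab"], lcp = [3, 2], so min_lcp[0][0] = 3,
    // min_lcp[0][1] = min(3, 3) = 3, and min_lcp[0][2] = min(3, 2) = 2.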

    // cost[j] is the minimum total encoded size of the first j strings
    // (usize::MAX acts as infinity until a cost is found).
    let mut cost = vec![usize::MAX; strings.len() + 1];
    cost[0] = 0;

    // block[j] holds the start of the block containing string j - 1, and
    // prefix[j] is that block's shared prefix length.
    let mut block = vec![0; strings.len() + 1];
    let mut prefix = vec![0; strings.len() + 1];

    // length_prefix_sum[k] = sum(strings[i].len() for i in 0..k)
    let mut length_prefix_sum = Vec::new();
    length_prefix_sum.push(0);
    for string in strings {
        length_prefix_sum.push(length_prefix_sum.last().unwrap() + string.len());
    }

    for end in 1..=strings.len() {
        for start in 0..end {
            // The longest prefix shared by all of strings[start..end].
            let min_prefix = min_lcp[start][end - 1];
            // Try the block both without a shared prefix and with the maximal one.
            for prefix_len in [0, min_prefix] {
                let n = end - start;
                // 1 length byte per string; 3 bytes when the string also carries a
                // reference to the shared prefix.
                let per_string_overhead = if prefix_len == 0 { 1 } else { 3 };
                let overhead = n * per_string_overhead;
                let sum_len = length_prefix_sum[end] - length_prefix_sum[start];
                // The shared prefix is stored once, saving its bytes in the other
                // n - 1 strings.
                let total_cost = cost[start] + overhead + sum_len - ((n - 1) * prefix_len);

                if total_cost < cost[end] {
                    cost[end] = total_cost;
                    block[end] = start;
                    prefix[end] = prefix_len;
                }
            }
        }
    }

    // Traverse the blocks from end to start.
    let mut blocks = Vec::new();

    let mut idx = strings.len();
    while idx > 0 {
        let start_idx = block[idx];
        let prefix_len = prefix[idx];

        blocks.push(SimilarityBlock { start_idx, prefix_len });

        // Jump backward to the start of this block, which is the end of the previous one.
        idx = start_idx;
    }

    // Reverse the list so the blocks are in order.
    blocks.reverse();

    blocks
}
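
/// A sketch of how a caller might materialize each block's shared prefix; this helper is
/// hypothetical and not part of the FSST+ format itself.
pub fn block_prefixes<'a>(strings: &[&'a [u8]], blocks: &[SimilarityBlock]) -> Vec<&'a [u8]> {
    blocks
        .iter()
        // `prefix_len` never exceeds the first string's length, so this slice is in bounds.
        .map(|block| &strings[block.start_idx][..block.prefix_len])
        .collect()
}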

#[cfg(test)]
mod tests {
    use fsst::Compressor;

    use crate::prefix::{chunk_by_similarity, longest_common_prefix};

    #[test]
    fn test_urls() {
        let strings = vec![
            "reddit.com".as_bytes(),
            "reddit.com/a".as_bytes(),
            "reddit.com/a/b".as_bytes(),
            "reddit.com/c".as_bytes(),
            "reddit.com/c/d/d".as_bytes(),
            "reddit.com/c/e".as_bytes(),
            "google.com".as_bytes(),
            "google.com/search?q=beans".as_bytes(),
            "google.com/search?q=black+beans".as_bytes(),
            "google.com/search?q=lima+beans".as_bytes(),
            "google.com/search?q=taylor+swift".as_bytes(),
        ];

        let compressor = Compressor::train(&strings);
        // Not asserted on yet; eventually the LCPs should be computed over these codes.
        let _compressed = compressor.compress_bulk(&strings);

        let lcps = longest_common_prefix(strings.as_slice());
        assert_eq!(lcps, vec![10, 12, 11, 12, 13, 0, 10, 21, 20, 20]);

        let chunks = chunk_by_similarity(&strings);

        dbg!(&chunks);

        // Once we have calculated the adjacent LCPs, expand them into blocks of many strings
        // sharing a single prefix.
    }
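
    // A minimal check of the doc-comment example for `longest_common_prefix`.
    #[test]
    fn test_lcp_simple() {
        let strings: Vec<&[u8]> = vec![b"abc", b"abcd", b"ab"];
        assert_eq!(longest_common_prefix(&strings), vec![3, 2]);
    }

    // Property checks on the block structure; these follow from the DP construction
    // rather than from any particular expected split.
    #[test]
    fn test_chunk_invariants() {
        let strings: Vec<&[u8]> = vec![b"http://a", b"http://b", b"zzz"];
        let blocks = chunk_by_similarity(&strings);

        // The first block always starts at string 0...
        assert_eq!(blocks[0].start_idx, 0);

        // ...and block start indices are strictly increasing.
        for w in blocks.windows(2) {
            assert!(w[0].start_idx < w[1].start_idx);
        }
    }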
}