
Commit 2adb639

Andrew Duffy (a10y) authored and committed
basic fsst+ beginnings
Signed-off-by: Andrew Duffy <[email protected]>
1 parent 03508f9 commit 2adb639

File tree

2 files changed: +189 -0 lines changed

2 files changed

+189
-0
lines changed

encodings/fsst/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ mod ops;
 mod serde;
 #[cfg(test)]
 mod tests;
+mod prefix;

 pub use array::*;
 pub use compress::*;

encodings/fsst/src/prefix.rs

Lines changed: 188 additions & 0 deletions
@@ -0,0 +1,188 @@
//! An implementation of the FSST+ extension, originally from https://github.com/cwida/fsst_plus/.
//!
//! FSST+ augments FSST with the addition of shared prefixes: runs of adjacent strings that
//! share a common prefix store that prefix once per block rather than once per string.

#![allow(unused)]

#[derive(Debug)]
pub struct SimilarityBlock {
    start_idx: usize,
    prefix_len: usize,
}

/// Maximum shared prefix length.
pub const MAX_PREFIX: usize = 128;

/// Find the longest common prefix between adjacent encoded texts.
pub fn longest_common_prefix(codes: &[&[u8]]) -> Vec<usize> {
    // LCP for each pair of successive strings.
    // LCP[i] is the length of the longest common prefix between the i-th and (i+1)-th strings.
    // For example, lcp(["abc", "abcd", "ab"]) -> [3, 2]
    let mut longest_common_prefix = Vec::new();

    // Calculate the LCP of consecutive strings in the input.
    for w in codes.windows(2) {
        let s1 = w[0];
        let s2 = w[1];

        // Count the matching leading bytes of the two consecutive strings.
        let mut lcp = 0;
        for (&a, &b) in s1.iter().zip(s2.iter()) {
            if a == b {
                lcp += 1;
            } else {
                break;
            }
        }

        longest_common_prefix.push(lcp);
    }

    longest_common_prefix
}

/// Input: a vector of FSST-encoded strings.
/// Output: a vector of "similarity blocks".
///
/// We first calculate the LCP between adjacent strings, then find the optimal split points to
/// create "similarity" blocks that all share a maximal prefix.
///
/// The simple recursive solution structure looks something like this:
///
/// ```python
/// def cost(i, j, min_lcp, lcp):
///     # find the min prefix between i and j
///     min_prefix = min_lcp[j][i]
///     block_assignment = []
///
///     # figure out the cost of associating i with i-1
///
///     # determine if grouping strings i/j is advantageous.
///     for prefix_len in [0, min_prefix]:
///         n_strings = i - j
///         # if the strings share a prefix, storing it once saves
///         # (n_strings - 1) copies at the price of per-string overhead.
/// ```
#[allow(clippy::needless_range_loop)]
pub fn chunk_by_similarity(strings: &[&[u8]]) -> Vec<SimilarityBlock> {
    // Calculate LCP between all adjacent items first.
    let lcp = longest_common_prefix(strings);

    // min_lcp[i][j] = length of the prefix shared by strings[i..=j], i.e.
    // min(LCP[i], ..., LCP[j-1]) capped at MAX_PREFIX.
    let mut min_lcp = vec![vec![0; strings.len()]; strings.len()];

    // ... the diagonal is the whole string, capped at MAX_PREFIX, the longest prefix we allow.
    for i in 0..strings.len() {
        min_lcp[i][i] = strings[i].len().min(MAX_PREFIX);
        for j in (i + 1)..strings.len() {
            min_lcp[i][j] = min_lcp[i][j - 1].min(lcp[j - 1]);
        }
    }
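    // For example, for ["abc", "abcd", "ab"] (lcp = [3, 2]): min_lcp[0][1] = 3, while
    // min_lcp[0][2] = 2, since all three strings share only the 2-byte prefix "ab".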

    // Cost is the total cost of the block split.
    let mut cost = vec![usize::MAX; 1 + strings.len()];
    cost[0] = 0;

    let mut block = vec![0; 1 + strings.len()];
    let mut prefix = vec![0; 1 + strings.len()];

    // Placeholder: estimate the cost of grouping strings i..=j (currently unused).
    let estimate_cost = |i: u32, j: u32| min_lcp[i as usize][j as usize];

    // length_prefix_sum[k] = sum(strings[i].len() for i in 0..k)
    let mut length_prefix_sum = Vec::new();
    length_prefix_sum.push(0);
    for string in strings {
        length_prefix_sum.push(length_prefix_sum.last().unwrap() + string.len());
    }
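    // For example, strings of lengths [3, 4, 2] give length_prefix_sum = [0, 3, 7, 9], so the
    // total byte length of strings[start..end] is length_prefix_sum[end] - length_prefix_sum[start].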

    // cost[j] holds the minimal cost of encoding the first j strings.
    // block[j] holds the start of the block that string j-1 is a part of.
    // prefix[j] is the shared prefix length for that block.
    for end in 1..=strings.len() {
        for start in 0..end {
            let min_prefix = min_lcp[start][end - 1];
            // Either share no prefix (1 byte of per-string overhead) or share the longest
            // prefix common to the whole block (3 bytes of per-string overhead).
            for prefix_len in [0, min_prefix] {
                let n = end - start;
                let per_string_overhead = if prefix_len == 0 { 1 } else { 3 };
                let overhead = n * per_string_overhead;
                let sum_len = length_prefix_sum[end] - length_prefix_sum[start];
                // The shared prefix is stored once, so the other n-1 copies are saved.
                let total_cost = cost[start] + overhead + sum_len - ((n - 1) * prefix_len);

                if total_cost < cost[end] {
                    cost[end] = total_cost;
                    block[end] = start;
                    prefix[end] = prefix_len;
                }
            }
        }
    }
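    // For example, a block of 4 strings sharing a 10-byte prefix pays 4 * 3 = 12 bytes of
    // overhead but saves (4 - 1) * 10 = 30 bytes by storing the shared prefix only once.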

    // Traverse the blocks from end to start.
    let mut blocks = Vec::new();

    let mut idx = strings.len();
    while idx > 0 {
        let start_idx = block[idx];
        let prefix_len = prefix[idx];

        blocks.push(SimilarityBlock { start_idx, prefix_len });

        // Advance backward to the start of this block.
        idx = start_idx;
    }
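    // The blocks are currently in reverse order and tile the input: each spans from its
    // start_idx up to the start_idx of the previously pushed block, with the first pushed
    // block ending at strings.len().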

    // Reverse the list of blocks so they're in order.
    blocks.reverse();

    blocks
}

#[cfg(test)]
mod tests {
    use fsst::Compressor;

    use crate::prefix::{chunk_by_similarity, longest_common_prefix};

    #[test]
    fn test_urls() {
        let strings = vec![
            "reddit.com".as_bytes(),
            "reddit.com/a".as_bytes(),
            "reddit.com/a/b".as_bytes(),
            "reddit.com/c".as_bytes(),
            "reddit.com/c/d/d".as_bytes(),
            "reddit.com/c/e".as_bytes(),
            "google.com".as_bytes(),
            "google.com/search?q=beans".as_bytes(),
            "google.com/search?q=black+beans".as_bytes(),
            "google.com/search?q=lima+beans".as_bytes(),
            "google.com/search?q=taylor+swift".as_bytes(),
        ];

        let compressor = Compressor::train(&strings);
        let result = compressor.compress_bulk(&strings);

        let lcps = longest_common_prefix(strings.as_slice());
        assert_eq!(lcps, vec![10, 12, 11, 12, 13, 0, 10, 21, 20, 20]);

        let chunks = chunk_by_similarity(&strings);

        dbg!(&chunks);

        // Once we have calculated the adjacent LCPs, expand into a new LCP with a bunch of
        // strings in a single block.
    }
}
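
A minimal usage sketch, not part of the commit (the inputs below are illustrative, echoing the test data): callers pass `chunk_by_similarity` byte strings ordered so that similar entries sit adjacent to one another, and get back one `SimilarityBlock` per prefix-sharing run.

    let strings: Vec<&[u8]> = vec![
        "reddit.com".as_bytes(),
        "reddit.com/a".as_bytes(),
        "reddit.com/a/b".as_bytes(),
    ];

    // Each SimilarityBlock records where a prefix-sharing run begins (start_idx)
    // and how many leading bytes the run shares (prefix_len).
    let blocks = chunk_by_similarity(&strings);
    for block in &blocks {
        println!("{block:?}");
    }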
