Skip to content

Commit 4a02738

Browse files
committed
simplify API and add slider post-processing
1 parent 1505ee1 commit 4a02738

18 files changed

+1453
-491
lines changed

Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "imara-diff"
3-
version = "0.1.8"
3+
version = "0.2.0"
44
edition = "2021"
55
authors = ["pascalkuthe <[email protected]>"]
66
rust-version = "1.71"
@@ -18,6 +18,7 @@ exclude = [
1818

1919
[dependencies]
2020
hashbrown = { version = "0.15", default-features = false, features = ["default-hasher", "inline-more"] }
21+
memchr = "2.7.4"
2122

2223
[features]
2324
default = ["unified_diff"]

src/histogram.rs

Lines changed: 20 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
use std::ops::Range;
2-
31
use crate::histogram::lcs::find_lcs;
42
use crate::histogram::list_pool::{ListHandle, ListPool};
53
use crate::intern::Token;
6-
use crate::util::{strip_common_postfix, strip_common_prefix};
7-
use crate::{myers, Sink};
4+
use crate::myers;
85

96
mod lcs;
107
mod list_pool;
@@ -16,17 +13,15 @@ struct Histogram {
1613
pool: ListPool,
1714
}
1815

19-
pub fn diff<S: Sink>(
20-
mut before: &[Token],
21-
mut after: &[Token],
16+
pub fn diff(
17+
before: &[Token],
18+
after: &[Token],
19+
removed: &mut [bool],
20+
added: &mut [bool],
2221
num_tokens: u32,
23-
mut sink: S,
24-
) -> S::Out {
22+
) {
2523
let mut histogram = Histogram::new(num_tokens);
26-
let prefix = strip_common_prefix(&mut before, &mut after);
27-
strip_common_postfix(&mut before, &mut after);
28-
histogram.run(before, prefix, after, prefix, &mut sink);
29-
sink.finish()
24+
histogram.run(before, after, removed, added);
3025
}
3126

3227
impl Histogram {
@@ -58,73 +53,49 @@ impl Histogram {
5853
fn run(
5954
&mut self,
6055
mut before: &[Token],
61-
mut before_off: u32,
6256
mut after: &[Token],
63-
mut after_off: u32,
64-
sink: &mut impl Sink,
57+
mut removed: &mut [bool],
58+
mut added: &mut [bool],
6559
) {
6660
loop {
6761
if before.is_empty() {
68-
if !after.is_empty() {
69-
sink.process_change(
70-
before_off..before_off,
71-
after_off..after_off + after.len() as u32,
72-
);
73-
}
62+
added.fill(true);
7463
return;
7564
} else if after.is_empty() {
76-
sink.process_change(
77-
before_off..before_off + before.len() as u32,
78-
after_off..after_off,
79-
);
65+
removed.fill(true);
8066
return;
8167
}
8268

8369
self.populate(before);
8470
match find_lcs(before, after, self) {
8571
// no lcs was found, which means that file1 and file2 have nothing in common
8672
Some(lcs) if lcs.len == 0 => {
87-
sink.process_change(
88-
before_off..before_off + before.len() as u32,
89-
after_off..after_off + after.len() as u32,
90-
);
73+
added.fill(true);
74+
removed.fill(true);
9175
return;
9276
}
9377
Some(lcs) => {
9478
self.run(
9579
&before[..lcs.before_start as usize],
96-
before_off,
9780
&after[..lcs.after_start as usize],
98-
after_off,
99-
sink,
81+
&mut removed[..lcs.before_start as usize],
82+
&mut added[..lcs.after_start as usize],
10083
);
10184

10285
// this is equivalent to (tail) recursion but implemented as a loop for efficiency reasons
10386
let before_end = lcs.before_start + lcs.len;
10487
before = &before[before_end as usize..];
105-
before_off += before_end;
88+
removed = &mut removed[before_end as usize..];
10689

10790
let after_end = lcs.after_start + lcs.len;
10891
after = &after[after_end as usize..];
109-
after_off += after_end;
92+
added = &mut added[after_end as usize..];
11093
}
11194
None => {
11295
// we are diffing two extremely large repetitive files
11396
// this is a worst case for histogram diff with O(N^2) performance
114-
// fallback to myers to maintain linear time complexity
115-
myers::diff(
116-
before,
117-
after,
118-
0, // not used by myers
119-
|mut before: Range<u32>, mut after: Range<u32>| {
120-
before.start += before_off;
121-
before.end += before_off;
122-
after.start += after_off;
123-
after.end += after_off;
124-
sink.process_change(before, after)
125-
},
126-
false,
127-
);
97+
// fallback to myers to maintain linear time complexity
98+
myers::diff(before, after, removed, added, false);
12899
return;
129100
}
130101
}

src/histogram/lcs.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@ impl LcsSearch {
8888

8989
let mut end1 = token_idx1 + 1;
9090
let mut end2 = after_pos + 1;
91-
9291
loop {
9392
let token1 = before.get(end1 as usize);
9493
let token2 = after.get(end2 as usize);

src/intern.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ pub trait TokenSource {
3636
fn estimate_tokens(&self) -> u32;
3737
}
3838

39-
/// Two lists of interned [tokens](crate::intern::Token) that can be compared with the [`diff`](crate::diff) function.
39+
/// Two lists of interned [tokens](crate::intern::Token) that a [`Diff`](crate::Diff) can be computed from.
4040
///
4141
/// A token represents the smallest possible unit of change during a diff.
4242
/// For text this is usually a line, a word or a single character.
@@ -74,6 +74,23 @@ impl<T: Eq + Hash> InternedInput<T> {
7474
res
7575
}
7676

77+
/// Reserves capacity for the token counts estimated by calling the
78+
/// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`
79+
pub fn reserve_for_token_source<S: TokenSource<Token = T> + ?Sized>(
80+
&mut self,
81+
before: &S,
82+
after: &S,
83+
) {
84+
self.reserve(before.estimate_tokens(), after.estimate_tokens())
85+
}
86+
87+
pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) {
88+
self.before.reserve(capacity_before as usize);
89+
self.after.reserve(capacity_after as usize);
90+
self.interner
91+
.reserve(capacity_before as usize + capacity_after as usize);
92+
}
93+
7794
/// replaces `self.before` with the interned Tokens yielded by `input`
7895
/// Note that this does not erase any tokens from the interner and might therefore be considered
7996
/// a memory leak. If this function is called often over a long_running process
@@ -133,6 +150,19 @@ impl<T> Interner<T> {
133150
}
134151

135152
impl<T: Hash + Eq> Interner<T> {
153+
/// Reserves capacity for the token counts estimated by calling the
154+
/// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`
155+
pub fn reserve_for_token_source<S: TokenSource<Token = T>>(&mut self, before: &S, after: &S) {
156+
self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize)
157+
}
158+
159+
pub fn reserve(&mut self, capacity: usize) {
160+
self.table.reserve(capacity, |&token| {
161+
self.hasher.hash_one(&self.tokens[token.0 as usize])
162+
});
163+
self.tokens.reserve(capacity);
164+
}
165+
136166
/// Intern `token` and return the interned integer.
137167
pub fn intern(&mut self, token: T) -> Token {
138168
let hash = self.hasher.hash_one(&token);

0 commit comments

Comments
 (0)