Skip to content

Commit e87b446

Browse files
committed
Replace an ASCII space check with an is_whitespace check in Atom::new
This also fixes an unrelated bug when parsing needles which contain non-ASCII Unicode.
1 parent 387b17c commit e87b446

File tree

2 files changed

+69
-29
lines changed

2 files changed

+69
-29
lines changed

Diff for: matcher/src/pattern.rs

+49-29
Original file line numberDiff line numberDiff line change
@@ -122,17 +122,30 @@ impl Atom {
122122
normalize = false;
123123
}
124124
let needle = if needle.is_ascii() {
125-
let mut needle = if escape_whitespace {
126-
if let Some((start, rem)) = needle.split_once("\\ ") {
127-
let mut needle = start.to_owned();
128-
for rem in rem.split("\\ ") {
129-
needle.push(' ');
130-
needle.push_str(rem);
125+
let mut needle_string = if escape_whitespace {
126+
let mut needle_bytes = Vec::with_capacity(needle.len());
127+
let mut saw_backslash = false;
128+
for c in needle.bytes() {
129+
if saw_backslash {
130+
if c.is_ascii_whitespace() {
131+
needle_bytes.push(c);
132+
saw_backslash = false;
133+
continue;
134+
} else {
135+
needle_bytes.push(b'\\');
136+
}
131137
}
132-
needle
133-
} else {
134-
needle.to_owned()
138+
saw_backslash = c == b'\\';
139+
if !saw_backslash {
140+
needle_bytes.push(c);
141+
}
142+
}
143+
// push the potentially trailing backslash
144+
if saw_backslash {
145+
needle_bytes.push(b'\\');
135146
}
147+
// SAFETY: we just checked that needle is ascii, so each `c` is a valid ASCII byte
148+
unsafe { String::from_utf8_unchecked(needle_bytes) }
136149
} else {
137150
needle.to_owned()
138151
};
@@ -141,18 +154,19 @@ impl Atom {
141154
#[cfg(feature = "unicode-casefold")]
142155
CaseMatching::Ignore => {
143156
ignore_case = true;
144-
needle.make_ascii_lowercase()
157+
needle_string.make_ascii_lowercase()
145158
}
146159
#[cfg(feature = "unicode-casefold")]
147160
CaseMatching::Smart => {
148-
ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
161+
ignore_case = !needle_string.bytes().any(|b| b.is_ascii_uppercase())
149162
}
150163
CaseMatching::Respect => ignore_case = false,
151164
}
165+
152166
if append_dollar {
153-
needle.push('$');
167+
needle_string.push('$');
154168
}
155-
Utf32String::Ascii(needle.into_boxed_str())
169+
Utf32String::Ascii(needle_string.into_boxed_str())
156170
} else {
157171
let mut needle_ = Vec::with_capacity(needle.len());
158172
#[cfg(feature = "unicode-casefold")]
@@ -171,32 +185,38 @@ impl Atom {
171185
let mut saw_backslash = false;
172186
for mut c in chars::graphemes(needle) {
173187
if saw_backslash {
174-
if c == ' ' {
175-
needle_.push(' ');
188+
if c.is_whitespace() {
189+
needle_.push(c);
176190
saw_backslash = false;
177191
continue;
178192
} else {
179193
needle_.push('\\');
180194
}
181195
}
182196
saw_backslash = c == '\\';
183-
match case {
184-
#[cfg(feature = "unicode-casefold")]
185-
CaseMatching::Ignore => c = chars::to_lower_case(c),
186-
#[cfg(feature = "unicode-casefold")]
187-
CaseMatching::Smart => {
188-
ignore_case = ignore_case && !chars::is_upper_case(c)
197+
if !saw_backslash {
198+
match case {
199+
#[cfg(feature = "unicode-casefold")]
200+
CaseMatching::Ignore => c = chars::to_lower_case(c),
201+
#[cfg(feature = "unicode-casefold")]
202+
CaseMatching::Smart => {
203+
ignore_case = ignore_case && !chars::is_upper_case(c)
204+
}
205+
CaseMatching::Respect => (),
189206
}
190-
CaseMatching::Respect => (),
191-
}
192-
match normalization {
193-
#[cfg(feature = "unicode-normalization")]
194-
Normalization::Smart => {
195-
normalize = normalize && chars::normalize(c) == c;
207+
match normalization {
208+
#[cfg(feature = "unicode-normalization")]
209+
Normalization::Smart => {
210+
normalize = normalize && chars::normalize(c) == c;
211+
}
212+
Normalization::Never => (),
196213
}
197-
Normalization::Never => (),
214+
needle_.push(c);
198215
}
199-
needle_.push(c);
216+
}
217+
// push the potentially trailing backslash
218+
if saw_backslash {
219+
needle_.push('\\');
200220
}
201221
} else {
202222
let chars = chars::graphemes(needle).map(|mut c| {

Diff for: matcher/src/pattern/tests.rs

+20
Original file line numberDiff line numberDiff line change
@@ -85,8 +85,28 @@ fn case_matching() {
8585

8686
#[test]
8787
fn escape() {
88+
// escapes only impact whitespace
8889
let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart);
8990
assert_eq!(pat.needle.to_string(), "foo bar");
91+
let pat = Atom::parse("foo\\\tbar", CaseMatching::Smart, Normalization::Smart);
92+
assert_eq!(pat.needle.to_string(), "foo\tbar");
93+
let pat = Atom::parse("\\", CaseMatching::Smart, Normalization::Smart);
94+
assert_eq!(pat.needle.to_string(), "\\");
95+
let pat = Atom::parse("\\\\", CaseMatching::Smart, Normalization::Smart);
96+
assert_eq!(pat.needle.to_string(), "\\\\");
97+
98+
// some unicode checks
99+
let pat = Atom::parse("foö\\ bar", CaseMatching::Smart, Normalization::Smart);
100+
assert_eq!(pat.needle.to_string(), "foö bar");
101+
let pat = Atom::parse("foö\\\\ bar", CaseMatching::Smart, Normalization::Smart);
102+
assert_eq!(pat.needle.to_string(), "foö\\ bar");
103+
let pat = Atom::parse("foo\\　bar", CaseMatching::Smart, Normalization::Smart);
104+
assert_eq!(pat.needle.to_string(), "foo　bar"); // double-width IDEOGRAPHIC SPACE
105+
let pat = Atom::parse("ö\\b", CaseMatching::Smart, Normalization::Smart);
106+
assert_eq!(pat.needle.to_string(), "ö\\b");
107+
let pat = Atom::parse("ö\\\\", CaseMatching::Smart, Normalization::Smart);
108+
assert_eq!(pat.needle.to_string(), "ö\\\\");
109+
90110
let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart);
91111
assert_eq!(pat.needle.to_string(), "!foo");
92112
assert_eq!(pat.kind, AtomKind::Fuzzy);

0 commit comments

Comments
 (0)