Replace an ASCII space check with a is_whitespace check in Atom::new

alexrutar · alexrutar · commit e87b4467c574 · 2024-12-09T22:09:46.000Z
This also fixes an unrelated bug when parsing needles which contain
non-ASCI Unicode.
diff --git a/matcher/src/pattern.rs b/matcher/src/pattern.rs
@@ -122,17 +122,30 @@ impl Atom {
             normalize = false;
         }
         let needle = if needle.is_ascii() {
-            let mut needle = if escape_whitespace {
-                if let Some((start, rem)) = needle.split_once("\\ ") {
-                    let mut needle = start.to_owned();
-                    for rem in rem.split("\\ ") {
-                        needle.push(' ');
-                        needle.push_str(rem);
+            let mut needle_string = if escape_whitespace {
+                let mut needle_bytes = Vec::with_capacity(needle.len());
+                let mut saw_backslash = false;
+                for c in needle.bytes() {
+                    if saw_backslash {
+                        if c.is_ascii_whitespace() {
+                            needle_bytes.push(c);
+                            saw_backslash = false;
+                            continue;
+                        } else {
+                            needle_bytes.push(b'\\');
+                        }
                     }
-                    needle
-                } else {
-                    needle.to_owned()
+                    saw_backslash = c == b'\\';
+                    if !saw_backslash {
+                        needle_bytes.push(c);
+                    }
+                }
+                // push the potentially trailing backslash
+                if saw_backslash {
+                    needle_bytes.push(b'\\');
                 }
+                // SAFETY: we just checked that needle is ascii, so each `c` is a valid ASCII byte
+                unsafe { String::from_utf8_unchecked(needle_bytes) }
             } else {
                 needle.to_owned()
             };
@@ -141,18 +154,19 @@ impl Atom {
                 #[cfg(feature = "unicode-casefold")]
                 CaseMatching::Ignore => {
                     ignore_case = true;
-                    needle.make_ascii_lowercase()
+                    needle_string.make_ascii_lowercase()
                 }
                 #[cfg(feature = "unicode-casefold")]
                 CaseMatching::Smart => {
-                    ignore_case = !needle.bytes().any(|b| b.is_ascii_uppercase())
+                    ignore_case = !needle_string.bytes().any(|b| b.is_ascii_uppercase())
                 }
                 CaseMatching::Respect => ignore_case = false,
             }
+
             if append_dollar {
-                needle.push('$');
+                needle_string.push('$');
             }
-            Utf32String::Ascii(needle.into_boxed_str())
+            Utf32String::Ascii(needle_string.into_boxed_str())
         } else {
             let mut needle_ = Vec::with_capacity(needle.len());
             #[cfg(feature = "unicode-casefold")]
@@ -171,32 +185,38 @@ impl Atom {
                 let mut saw_backslash = false;
                 for mut c in chars::graphemes(needle) {
                     if saw_backslash {
-                        if c == ' ' {
-                            needle_.push(' ');
+                        if c.is_whitespace() {
+                            needle_.push(c);
                             saw_backslash = false;
                             continue;
                         } else {
                             needle_.push('\\');
                         }
                     }
                     saw_backslash = c == '\\';
-                    match case {
-                        #[cfg(feature = "unicode-casefold")]
-                        CaseMatching::Ignore => c = chars::to_lower_case(c),
-                        #[cfg(feature = "unicode-casefold")]
-                        CaseMatching::Smart => {
-                            ignore_case = ignore_case && !chars::is_upper_case(c)
+                    if !saw_backslash {
+                        match case {
+                            #[cfg(feature = "unicode-casefold")]
+                            CaseMatching::Ignore => c = chars::to_lower_case(c),
+                            #[cfg(feature = "unicode-casefold")]
+                            CaseMatching::Smart => {
+                                ignore_case = ignore_case && !chars::is_upper_case(c)
+                            }
+                            CaseMatching::Respect => (),
                         }
-                        CaseMatching::Respect => (),
-                    }
-                    match normalization {
-                        #[cfg(feature = "unicode-normalization")]
-                        Normalization::Smart => {
-                            normalize = normalize && chars::normalize(c) == c;
+                        match normalization {
+                            #[cfg(feature = "unicode-normalization")]
+                            Normalization::Smart => {
+                                normalize = normalize && chars::normalize(c) == c;
+                            }
+                            Normalization::Never => (),
                         }
-                        Normalization::Never => (),
+                        needle_.push(c);
                     }
-                    needle_.push(c);
+                }
+                // push the potentially trailing backslash
+                if saw_backslash {
+                    needle_.push('\\');
                 }
             } else {
                 let chars = chars::graphemes(needle).map(|mut c| {
diff --git a/matcher/src/pattern/tests.rs b/matcher/src/pattern/tests.rs
@@ -85,8 +85,28 @@ fn case_matching() {
 
 #[test]
 fn escape() {
+    // escapes only impact whitespace
     let pat = Atom::parse("foo\\ bar", CaseMatching::Smart, Normalization::Smart);
     assert_eq!(pat.needle.to_string(), "foo bar");
+    let pat = Atom::parse("foo\\\tbar", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "foo\tbar");
+    let pat = Atom::parse("\\", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "\\");
+    let pat = Atom::parse("\\\\", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "\\\\");
+
+    // some unicode checks
+    let pat = Atom::parse("foö\\ bar", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "foö bar");
+    let pat = Atom::parse("foö\\\\ bar", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "foö\\ bar");
+    let pat = Atom::parse("foo\\　bar", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "foo　bar"); // double-width IDEOGRAPHIC SPACE
+    let pat = Atom::parse("ö\\b", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "ö\\b");
+    let pat = Atom::parse("ö\\\\", CaseMatching::Smart, Normalization::Smart);
+    assert_eq!(pat.needle.to_string(), "ö\\\\");
+
     let pat = Atom::parse("\\!foo", CaseMatching::Smart, Normalization::Smart);
     assert_eq!(pat.needle.to_string(), "!foo");
     assert_eq!(pat.kind, AtomKind::Fuzzy);