From 35df166062e5caa8e40e2a0ce37efecb2d86813b Mon Sep 17 00:00:00 2001 From: calyptobai Date: Wed, 31 Jan 2024 19:06:12 -0500 Subject: [PATCH] Fix long queries at `/q` This fixes code search against repos where the name is longer than 5 characters. The general strategy here is to bail out of case permutations if any tokens in the generated token stream are too long. This means that for a query like `repo:foobar quux`, the `quux` portion will match in a case-insensitive fashion, while the repo `foobar` must match case exactly. --- server/bleep/src/query/compiler.rs | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/server/bleep/src/query/compiler.rs b/server/bleep/src/query/compiler.rs index 97920efb00..993a4b923d 100644 --- a/server/bleep/src/query/compiler.rs +++ b/server/bleep/src/query/compiler.rs @@ -19,6 +19,8 @@ use crate::query::{ planner, }; +const MAX_CASE_PERMUTATION_LEN: usize = 5; + type DynQuery = Box; enum Extraction<'a> { @@ -106,12 +108,20 @@ impl Compiler { let mut token_stream = tokenizer.token_stream(&text); let tokens = std::iter::from_fn(move || { token_stream.next().map(|tok| CompactString::new(&tok.text)) - }); + }) + .collect::>(); - let terms = if query.is_case_sensitive() { - tokens.map(|s| str_to_query(*field, &s)).collect::>() + // We skip case insensitive matching if any token is longer than MAX_CASE_PERMUTATION_LEN. + let terms = if query.is_case_sensitive() + || tokens.iter().any(|t| t.len() > MAX_CASE_PERMUTATION_LEN) + { + tokens + .into_iter() + .map(|s| str_to_query(*field, &s)) + .collect::>() } else { tokens + .into_iter() .map(|s| { let terms = case_permutations(&s) .map(|s| str_to_query(*field, &s)) @@ -253,9 +263,8 @@ pub fn case_permutations(s: &str) -> impl Iterator { .map(|c| c.to_ascii_lowercase()) .collect::>(); - // Make sure not to overflow. The end condition is a mask with the highest bit set, and we use - // `u32` masks. 
- debug_assert!(chars.len() <= 5); + // A string that is too long leads to an exponential explosion here, growing with 2^n. + debug_assert!(chars.len() <= MAX_CASE_PERMUTATION_LEN); let num_chars = chars.len();