Commit b8affcd

Authored by ayangweb, SteveLauC, and medcl
refactor: improve sorting logic of search results (#910)
* refactor: improve sorting logic of search results
* refactor: update
* wip
* feat: support switching groups via keyboard shortcuts (#911)
* feat: support switching groups via keyboard shortcuts
* refactor: update
* docs: update changelog
* refactor post-querying logic
* refactor post-querying logic
* refactor post-querying logic
* refactor post-querying logic
* refactor: refactoring rerank function
* refactor: refactoring rerank with intelligent hybrid scorer
* chore: remove debug logging
* chore: fix format

--------

Co-authored-by: Steve Lau <[email protected]>
Co-authored-by: medcl <[email protected]>
1 parent 595ae67 commit b8affcd

File tree

2 files changed: +180 −122 lines changed


src-tauri/src/search/mod.rs

Lines changed: 176 additions & 120 deletions
```diff
@@ -10,13 +10,10 @@ use function_name::named;
 use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use reqwest::StatusCode;
-use std::cmp::Reverse;
 use std::collections::HashMap;
-use std::collections::HashSet;
 use std::sync::Arc;
 use tauri::{AppHandle, Manager};
 use tokio::time::{Duration, timeout};
-
 #[named]
 #[tauri::command]
 pub async fn query_coco_fusion(
@@ -187,7 +184,6 @@ async fn query_coco_fusion_multi_query_sources(
 
     let mut futures = FuturesUnordered::new();
 
-    let query_source_list_len = query_source_trait_object_list.len();
     for query_source_trait_object in query_source_trait_object_list {
         let query_source = query_source_trait_object.get_type().clone();
         let tauri_app_handle_clone = tauri_app_handle.clone();
@@ -208,14 +204,8 @@ async fn query_coco_fusion_multi_query_sources(
     }
 
     let mut total_hits = 0;
-    let mut need_rerank = true; //TODO set default to false when boost supported in Pizza
     let mut failed_requests = Vec::new();
-    let mut all_hits: Vec<(String, QueryHits, f64)> = Vec::new();
-    let mut hits_per_source: HashMap<String, Vec<(QueryHits, f64)>> = HashMap::new();
-
-    if query_source_list_len > 1 {
-        need_rerank = true; // If we have more than one source, we need to rerank the hits
-    }
+    let mut all_hits_grouped_by_source_id: HashMap<String, Vec<QueryHits>> = HashMap::new();
 
     while let Some((query_source, timeout_result)) = futures.next().await {
         match timeout_result {
@@ -246,12 +236,10 @@ async fn query_coco_fusion_multi_query_sources(
                         document,
                     };
 
-                    all_hits.push((source_id.clone(), query_hit.clone(), score));
-
-                    hits_per_source
+                    all_hits_grouped_by_source_id
                         .entry(source_id.clone())
                         .or_insert_with(Vec::new)
-                        .push((query_hit, score));
+                        .push(query_hit);
                 }
             }
             Err(search_error) => {
@@ -267,109 +255,117 @@ async fn query_coco_fusion_multi_query_sources(
         }
     }
 
-    // Sort hits within each source by score (descending)
-    for hits in hits_per_source.values_mut() {
-        hits.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Greater));
-    }
-
-    let total_sources = hits_per_source.len();
-    let max_hits_per_source = if total_sources > 0 {
-        size as usize / total_sources
-    } else {
-        size as usize
-    };
+    let n_sources = all_hits_grouped_by_source_id.len();
 
-    let mut final_hits = Vec::new();
-    let mut seen_docs = HashSet::new(); // To track documents we've already added
-
-    // Distribute hits fairly across sources
-    for (_source_id, hits) in &mut hits_per_source {
-        let take_count = hits.len().min(max_hits_per_source);
-        for (doc, score) in hits.drain(0..take_count) {
-            if !seen_docs.contains(&doc.document.id) {
-                seen_docs.insert(doc.document.id.clone());
-                log::debug!(
-                    "collect doc: {}, {:?}, {}",
-                    doc.document.id,
-                    doc.document.title,
-                    score
-                );
-                final_hits.push(doc);
-            }
-        }
+    if n_sources == 0 {
+        return Ok(MultiSourceQueryResponse {
+            failed: Vec::new(),
+            hits: Vec::new(),
+            total_hits: 0,
+        });
     }
 
-    log::debug!("final hits: {:?}", final_hits.len());
-
-    let mut unique_sources = HashSet::new();
-    for hit in &final_hits {
-        if let Some(source) = &hit.source {
-            if source.id != crate::extension::built_in::calculator::DATA_SOURCE_ID {
-                unique_sources.insert(&source.id);
-            }
-        }
+    /*
+     * Re-rank the hits
+     */
+    if n_sources > 1 {
+        boosted_levenshtein_rerank(&query_keyword, &mut all_hits_grouped_by_source_id);
     }
 
-    log::debug!(
-        "Multiple sources found: {:?}, no rerank needed",
-        unique_sources
-    );
-
-    if unique_sources.len() < 1 {
-        need_rerank = false; // If we have hits from multiple sources, we don't need to rerank
+    /*
+     * Sort hits within each source by score (descending) in case data sources
+     * do not sort them
+     */
+    for hits in all_hits_grouped_by_source_id.values_mut() {
+        hits.sort_by(|a, b| {
+            b.score
+                .partial_cmp(&a.score)
+                .unwrap_or(std::cmp::Ordering::Greater)
+        });
     }
 
-    if need_rerank && final_hits.len() > 1 {
-        // Precollect (index, title)
-        let titles_to_score: Vec<(usize, &str)> = final_hits
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, hit)| {
-                let source = hit.source.as_ref()?;
-                let title = hit.document.title.as_deref()?;
+    /*
+     * Collect hits evenly across sources, to ensure:
+     *
+     * 1. All sources have hits returned
+     * 2. Query sources with many hits won't dominate
+     */
+    let mut final_hits_grouped_by_source_id: HashMap<String, Vec<QueryHits>> = HashMap::new();
+    let mut pruned: HashMap<&str, &[QueryHits]> = HashMap::new();
+
+    // max_hits_per_source could be 0, then `final_hits_grouped_by_source_id`
+    // would be empty. But we don't need to worry about this case as we will
+    // populate hits later.
+    let max_hits_per_source = size as usize / n_sources;
+    for (source_id, hits) in all_hits_grouped_by_source_id.iter() {
+        let hits_taken = if hits.len() > max_hits_per_source {
+            pruned.insert(&source_id, &hits[max_hits_per_source..]);
+            hits[0..max_hits_per_source].to_vec()
+        } else {
+            hits.clone()
+        };
+
+        final_hits_grouped_by_source_id.insert(source_id.clone(), hits_taken);
+    }
 
-                if source.id != crate::extension::built_in::calculator::DATA_SOURCE_ID {
-                    Some((idx, title))
-                } else {
-                    None
+    let final_hits_len = final_hits_grouped_by_source_id
+        .iter()
+        .fold(0, |acc: usize, (_source_id, hits)| acc + hits.len());
+    let pruned_len = pruned
+        .iter()
+        .fold(0, |acc: usize, (_source_id, hits)| acc + hits.len());
+
+    /*
+     * If we still need more hits, take the highest-scoring from `pruned`
+     *
+     * `pruned` contains sorted arrays, we scan it in a way similar to
+     * how n-way-merge-sort extracts the element with the greatest value.
+     */
+    if final_hits_len < size as usize {
+        let n_need = size as usize - final_hits_len;
+        let n_have = pruned_len;
+        let n_take = n_have.min(n_need);
+
+        for _ in 0..n_take {
+            let mut highest_score_hit: Option<(&str, &QueryHits)> = None;
+            for (source_id, sorted_hits) in pruned.iter_mut() {
+                if sorted_hits.is_empty() {
+                    continue;
                 }
-            })
-            .collect();
 
-        // Score them
-        let scored_hits = boosted_levenshtein_rerank(query_keyword.as_str(), titles_to_score);
+                let hit = &sorted_hits[0];
 
-        // Sort descending by score
-        let mut scored_hits = scored_hits;
-        scored_hits.sort_by_key(|&(_, score)| Reverse((score * 1000.0) as u64));
+                let have_higher_score_hit = match highest_score_hit {
+                    Some((_, current_highest_score_hit)) => {
+                        hit.score > current_highest_score_hit.score
+                    }
+                    None => true,
+                };
 
-        // Apply new scores to final_hits
-        for (idx, score) in scored_hits.into_iter().take(size as usize) {
-            final_hits[idx].score = score;
-        }
-    } else if final_hits.len() < size as usize {
-        // If we still need more hits, take the highest-scoring remaining ones
+                if have_higher_score_hit {
+                    highest_score_hit = Some((*source_id, hit));
 
-        let remaining_needed = size as usize - final_hits.len();
+                    // Advance sorted_hits by 1 element, if have
+                    if sorted_hits.len() == 1 {
+                        *sorted_hits = &[];
+                    } else {
+                        *sorted_hits = &sorted_hits[1..];
+                    }
+                }
+            }
 
-        // Sort all hits by score descending, removing duplicates by document ID
-        all_hits.sort_by(|a, b| b.2.partial_cmp(&a.2).unwrap_or(std::cmp::Ordering::Equal));
+            let (source_id, hit) = highest_score_hit.expect("`pruned` should contain at least `n_take` elements so `highest_score_hit` should be set");
 
-        let extra_hits = all_hits
-            .into_iter()
-            .filter(|(source_id, _, _)| hits_per_source.contains_key(source_id)) // Only take from known sources
-            .filter_map(|(_, doc, _)| {
-                if !seen_docs.contains(&doc.document.id) {
-                    seen_docs.insert(doc.document.id.clone());
-                    Some(doc)
-                } else {
-                    None
-                }
-            })
-            .take(remaining_needed)
-            .collect::<Vec<_>>();
+            final_hits_grouped_by_source_id
+                .get_mut(source_id)
+                .expect("all the source_ids stored in `pruned` come from `final_hits_grouped_by_source_id`, so it should exist")
+                .push(hit.clone());
+        }
+    }
 
-        final_hits.extend(extra_hits);
+    let mut final_hits = Vec::new();
+    for (_source_id, hits) in final_hits_grouped_by_source_id {
+        final_hits.extend(hits);
     }
 
     // **Sort final hits by score descending**
```
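
The hunk above replaces the old rerank-or-backfill branching with a two-phase fill: each source first gets an equal quota of `size / n_sources` hits, and any remaining slots are backfilled from the per-source leftovers (`pruned`) by repeatedly extracting the globally highest-scoring head, the way an n-way merge does. Below is a minimal standalone sketch of that strategy, assuming plain `f64` scores in place of `QueryHits`; `take_evenly` is an illustrative name, not a function in this commit.

```rust
use std::collections::HashMap;

/// Hypothetical sketch of the quota-then-backfill strategy above. Each
/// source's scores must already be sorted descending, mirroring the
/// per-source sort in `query_coco_fusion_multi_query_sources`.
fn take_evenly(per_source: &HashMap<String, Vec<f64>>, size: usize) -> Vec<f64> {
    let n_sources = per_source.len();
    if n_sources == 0 {
        return Vec::new();
    }

    // Every source gets an equal quota; integer division may yield 0,
    // in which case the backfill below does all the work.
    let quota = size / n_sources;
    let mut taken = Vec::new();
    let mut pruned: Vec<&[f64]> = Vec::new(); // leftovers, still sorted descending

    for scores in per_source.values() {
        let cut = scores.len().min(quota);
        taken.extend_from_slice(&scores[..cut]);
        if scores.len() > cut {
            pruned.push(&scores[cut..]);
        }
    }

    // Backfill: repeatedly pop the globally highest head across the
    // leftover slices, the way an n-way merge extracts its next element.
    while taken.len() < size {
        let best = pruned
            .iter_mut()
            .filter(|s| !s.is_empty())
            .max_by(|a, b| a[0].partial_cmp(&b[0]).unwrap_or(std::cmp::Ordering::Equal));
        match best {
            Some(head) => {
                taken.push(head[0]);
                *head = &head[1..]; // advance this slice by one element
            }
            None => break, // leftovers exhausted
        }
    }

    // Final global sort, matching the end of the real function.
    taken.sort_by(|a, b| b.partial_cmp(a).unwrap_or(std::cmp::Ordering::Equal));
    taken
}

fn main() {
    let per_source = HashMap::from([
        ("a".to_string(), vec![0.9, 0.8, 0.7, 0.6]),
        ("b".to_string(), vec![0.5]),
    ]);
    // Quota is 4 / 2 = 2: "a" contributes [0.9, 0.8], "b" contributes [0.5],
    // and the fourth slot is backfilled with 0.7 from "a"'s leftovers.
    assert_eq!(take_evenly(&per_source, 4), vec![0.9, 0.8, 0.7, 0.5]);
}
```

Like the real code, the sketch relies on each per-source list already being sorted descending, so inspecting only the head of every leftover slice is enough to find the global maximum.
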
```diff
@@ -379,6 +375,11 @@ async fn query_coco_fusion_multi_query_sources(
             .unwrap_or(std::cmp::Ordering::Equal)
     });
 
+    // Truncate `final_hits` in case it contains more than `size` hits
+    //
+    // Technically, we are safe to not do this. But since it is trivial, double-check it.
+    final_hits.truncate(size as usize);
+
     if final_hits.len() < 5 {
         //TODO: Add a recommendation system to suggest more sources
         log::info!(
@@ -395,30 +396,85 @@ async fn query_coco_fusion_multi_query_sources(
     })
 }
 
-fn boosted_levenshtein_rerank(query: &str, titles: Vec<(usize, &str)>) -> Vec<(usize, f64)> {
-    use strsim::levenshtein;
+use std::collections::HashSet;
+use strsim::levenshtein;
 
+fn boosted_levenshtein_rerank(
+    query: &str,
+    all_hits_grouped_by_source_id: &mut HashMap<String, Vec<QueryHits>>,
+) {
     let query_lower = query.to_lowercase();
 
-    titles
-        .into_iter()
-        .map(|(idx, title)| {
-            let mut score = 0.0;
+    for (source_id, hits) in all_hits_grouped_by_source_id.iter_mut() {
+        // Skip special sources like calculator
+        if source_id == crate::extension::built_in::calculator::DATA_SOURCE_ID {
+            continue;
+        }
+
+        for hit in hits.iter_mut() {
+            let document_title = hit.document.title.as_deref().unwrap_or("");
+            let document_title_lowercase = document_title.to_lowercase();
 
-            if title.contains(query) {
-                score += 0.4;
-            } else if title.to_lowercase().contains(&query_lower) {
-                score += 0.2;
-            }
+            let new_score = {
+                let mut score = 0.0;
 
-            let dist = levenshtein(&query_lower, &title.to_lowercase());
-            let max_len = query_lower.len().max(title.len());
-            if max_len > 0 {
-                score += (1.0 - (dist as f64 / max_len as f64)) as f32;
-            }
+                // --- Exact or substring boost ---
+                if document_title.contains(query) {
+                    score += 0.4;
+                } else if document_title_lowercase.contains(&query_lower) {
+                    score += 0.2;
+                }
+
+                // --- Levenshtein distance (character similarity) ---
+                let dist = levenshtein(&query_lower, &document_title_lowercase);
+                let max_len = query_lower.len().max(document_title.len());
+                let levenshtein_score = if max_len > 0 {
+                    (1.0 - (dist as f64 / max_len as f64)) as f32
+                } else {
+                    0.0
+                };
+
+                // --- Jaccard similarity (token overlap) ---
+                let jaccard_score = jaccard_similarity(&query_lower, &document_title_lowercase);
+
+                // --- Combine scores (weights adjustable) ---
+                // Levenshtein emphasizes surface similarity
+                // Jaccard emphasizes term overlap (semantic hint)
+                let hybrid_score = 0.7 * levenshtein_score + 0.3 * jaccard_score;
+
+                // --- Apply hybrid score ---
+                score += hybrid_score;
+
+                // --- Limit score range ---
+                score.min(1.0) as f64
+            };
+
+            hit.score = new_score;
+        }
+    }
+}
+
+/// Compute token-based Jaccard similarity
+fn jaccard_similarity(a: &str, b: &str) -> f32 {
+    let a_tokens: HashSet<_> = tokenize(a).into_iter().collect();
+    let b_tokens: HashSet<_> = tokenize(b).into_iter().collect();
+
+    if a_tokens.is_empty() || b_tokens.is_empty() {
+        return 0.0;
+    }
+
+    let intersection = a_tokens.intersection(&b_tokens).count() as f32;
+    let union = a_tokens.union(&b_tokens).count() as f32;
+
+    intersection / union
+}
 
-            (idx, score.min(1.0) as f64)
-        })
+/// Basic tokenizer (case-insensitive, alphanumeric words only)
+fn tokenize(text: &str) -> Vec<String> {
+    text.to_lowercase()
+        .split(|c: char| !c.is_alphanumeric())
+        .filter(|s| !s.is_empty())
+        .map(|s| s.to_string())
         .collect()
 }
 
```
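
The rewritten `boosted_levenshtein_rerank` scores every title with three signals: a substring boost (0.4 for a case-sensitive match, 0.2 for a case-insensitive one), a normalized Levenshtein similarity weighted at 0.7, and a token-level Jaccard similarity weighted at 0.3, with the sum capped at 1.0. The following self-contained sketch reproduces that formula on bare strings, leaving out the `QueryHits` plumbing; `hybrid_score` is an illustrative name, and the `strsim` version in the comment is an assumption.

```rust
use std::collections::HashSet;
use strsim::levenshtein; // Cargo.toml: strsim = "0.11" (version assumed)

/// Illustrative re-implementation of the per-hit score computed in
/// `boosted_levenshtein_rerank`, operating on bare strings.
fn hybrid_score(query: &str, title: &str) -> f64 {
    let query_lower = query.to_lowercase();
    let title_lower = title.to_lowercase();

    // Substring boost: a case-sensitive match outranks a case-insensitive one.
    let boost = if title.contains(query) {
        0.4
    } else if title_lower.contains(&query_lower) {
        0.2
    } else {
        0.0
    };

    // Normalized Levenshtein similarity in [0, 1].
    let dist = levenshtein(&query_lower, &title_lower) as f64;
    let max_len = query_lower.len().max(title.len()) as f64;
    let lev = if max_len > 0.0 { 1.0 - dist / max_len } else { 0.0 };

    // Token-level Jaccard similarity in [0, 1], mirroring
    // `jaccard_similarity` and `tokenize` above.
    let tokens = |s: &str| -> HashSet<String> {
        s.split(|c: char| !c.is_alphanumeric())
            .filter(|t| !t.is_empty())
            .map(|t| t.to_string())
            .collect()
    };
    let (a, b) = (tokens(&query_lower), tokens(&title_lower));
    let jaccard = if a.is_empty() || b.is_empty() {
        0.0
    } else {
        a.intersection(&b).count() as f64 / a.union(&b).count() as f64
    };

    // Weighted blend, capped at 1.0 as in the diff.
    (boost + 0.7 * lev + 0.3 * jaccard).min(1.0)
}

fn main() {
    // levenshtein("rust", "rust programming") = 12, max_len = 16, lev = 0.25;
    // Jaccard = |{rust}| / |{rust, programming}| = 0.5; boost = 0.2.
    // Total: 0.2 + 0.7 * 0.25 + 0.3 * 0.5 = 0.525
    println!("{:.3}", hybrid_score("rust", "Rust Programming"));
}
```

As in the diff, the normalization divides a character-level edit distance by a byte length (`str::len`); the two agree for ASCII titles.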

src/hooks/useSearch.ts

Lines changed: 4 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
 import { useState, useCallback, useMemo, useRef } from "react";
-import { debounce } from "lodash-es";
+import { debounce, orderBy } from "lodash-es";
 
 import type {
   QueryHits,
@@ -65,7 +65,9 @@ export function useSearch() {
     response: MultiSourceQueryResponse,
     searchInput: string
   ) => {
-    const data = response?.hits || [];
+    const hits = response?.hits ?? [];
+
+    const data = orderBy(hits, "score", "desc");
 
     const searchData = data.reduce(
       (acc: SearchDataBySource, item: QueryHits) => {
```
