Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 98 additions & 15 deletions backend/app/api/v1/arxiv.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# filepath: backend/app/api/v1/arxiv.py
import itertools
import math
import re
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime, timezone
from typing import List
from urllib.parse import quote
Expand Down Expand Up @@ -47,6 +49,15 @@
"authority": 0.15,
}

# Relevance-scoring hyperparameters (BM25 term-saturation / length-normalization
# knobs plus the weights used to blend per-field scores into one number).
BM25_K1 = 1.4   # term-frequency saturation: higher -> repeated terms keep adding score
BM25_B = 0.65   # document-length normalization strength (0 = off, 1 = full)
TITLE_BM25_WEIGHT = 1.2    # title matches count slightly more than summary matches
SUMMARY_BM25_WEIGHT = 1.0
AUTHOR_BM25_WEIGHT = 0.6   # author-name matches are useful but weakest signal
PHRASE_MATCH_BONUS = 0.1   # flat bonus when the query appears as a contiguous phrase
RELEVANCE_COMBINATION = (0.65, 0.25, 0.10)  # bm25, token overlap, phrase bonus

# multi-batch candidate expansion tuning
DEFAULT_BATCH_SIZE = 25
MAX_BATCH_SIZE = 50
Expand Down Expand Up @@ -198,6 +209,35 @@ def tokenize(text: str) -> List[str]:
query_tokens = tokenize(query)
query_token_set = set(query_tokens)

# Pre-compute per-field tokenization and corpus-wide statistics in one pass,
# so the BM25-style relevance scoring below never re-tokenizes a paper.
tokenized_candidates: list[dict[str, list[str]]] = []
doc_freq = Counter()   # document frequency: number of candidates containing each token
combined_length = 0    # total tokens across all candidates, for the average doc length

for paper in candidates:
    title_tokens = tokenize(paper.get("title", ""))
    summary_tokens = tokenize(paper.get("summary", ""))
    author_tokens: List[str] = []
    for name in paper.get("authors", []):
        author_tokens.extend(tokenize(name))

    combined_tokens = list(itertools.chain(title_tokens, summary_tokens, author_tokens))
    combined_length += len(combined_tokens)
    # set() so each token counts at most once per document, as df requires
    for token in set(combined_tokens):
        doc_freq[token] += 1

    tokenized = {
        "title": title_tokens,
        "summary": summary_tokens,
        "authors": author_tokens,
        "combined": combined_tokens,
    }
    tokenized_candidates.append(tokenized)
    # cache on the paper dict itself; popped off again before the response is built
    paper["_tokens"] = tokenized

doc_count = len(tokenized_candidates)
# fall back to 1.0 to avoid division by zero when there are no candidates
avg_combined_len = combined_length / doc_count if doc_count else 1.0

def parse_date(date_str: str):
if not date_str:
return None
Expand All @@ -211,27 +251,69 @@ def overlap_score(target_tokens: List[str]) -> float:
return 0.0
return len(query_token_set & set(target_tokens)) / len(query_token_set)

def compute_bm25(field_tokens: List[str]) -> float:
    """Okapi BM25 score of one field against the enclosing query tokens.

    Uses the corpus statistics (doc_freq, doc_count, avg_combined_len)
    pre-computed over all candidates; returns 0.0 when either the query
    or the field is empty.
    """
    if not query_tokens or not field_tokens:
        return 0.0

    counts = Counter(field_tokens)
    # length normalization is identical for every term in this field, so hoist it
    length_norm = BM25_K1 * (1 - BM25_B + BM25_B * (len(field_tokens) / avg_combined_len))

    total = 0.0
    # iterate the raw query list: a term repeated in the query contributes repeatedly
    for term in query_tokens:
        df = doc_freq.get(term, 0)
        tf = counts.get(term, 0)
        if df == 0 or doc_count == 0 or tf == 0:
            continue

        # classic BM25 idf, shifted by +1 inside the log to stay non-negative
        idf = math.log(1 + (doc_count - df + 0.5) / (df + 0.5))
        total += idf * tf * (BM25_K1 + 1) / (tf + length_norm)

    return total

def phrase_bonus(paper: dict) -> float:
    """Return PHRASE_MATCH_BONUS when the whole query occurs as a contiguous
    phrase in the tokenized title or summary, else 0.0.

    Biases ranking toward papers that match the query as a focused phrase
    rather than as scattered terms.
    """
    # a single-token query carries no phrase information
    if len(query_tokens) < 2:
        return 0.0

    needle = " ".join(query_tokens)
    cached = paper.get("_tokens", {})
    for field in ("title", "summary"):
        haystack = " ".join(cached.get(field, []))
        if needle and needle in haystack:
            return PHRASE_MATCH_BONUS
    return 0.0

def compute_relevance(paper: dict) -> float:
    """Composite relevance of one paper for the current query, clamped to [0, 1].

    Blends three signals using RELEVANCE_COMBINATION weights:
      1. field-weighted BM25 (title > summary > authors), squashed via tanh
         because raw BM25 sums are unbounded;
      2. plain token-overlap coverage of the query against all fields;
      3. a flat contiguous-phrase bonus from phrase_bonus().

    NOTE(review): the pasted diff interleaved the deleted pre-BM25
    implementation (inline tokenize + overlap blend) with the added one;
    this is the added-side implementation, reconstructed cleanly.
    """
    if not query_token_set:
        return 0.0

    # tokenization was pre-computed once per paper and cached under "_tokens"
    tokens = paper.get("_tokens", {})
    title_tokens = tokens.get("title", [])
    summary_tokens = tokens.get("summary", [])
    author_tokens = tokens.get("authors", [])
    combined_tokens = tokens.get("combined", [])

    bm25_title = compute_bm25(title_tokens)
    bm25_summary = compute_bm25(summary_tokens)
    bm25_authors = compute_bm25(author_tokens)

    weighted_bm25 = (
        TITLE_BM25_WEIGHT * bm25_title
        + SUMMARY_BM25_WEIGHT * bm25_summary
        + AUTHOR_BM25_WEIGHT * bm25_authors
    ) / (TITLE_BM25_WEIGHT + SUMMARY_BM25_WEIGHT + AUTHOR_BM25_WEIGHT)

    # tanh maps the unbounded BM25 average into [0, 1)
    normalized_bm25 = math.tanh(weighted_bm25)
    coverage = overlap_score(combined_tokens)
    bonus = phrase_bonus(paper)

    bm25_w, overlap_w, bonus_w = RELEVANCE_COMBINATION
    composite = bm25_w * normalized_bm25 + overlap_w * coverage + bonus_w * bonus
    return min(1.0, composite)

def compute_recency(paper: dict) -> float:
date_str = paper.get("updated") or paper.get("published")
Expand Down Expand Up @@ -285,6 +367,7 @@ def score_paper(paper: dict) -> float:
selected = candidates[:max_results]
for paper in selected:
paper.pop("_score", None)
paper.pop("_tokens", None)

return {
"query": query,
Expand Down
Empty file modified scripts/dev.sh
100644 → 100755
Empty file.