Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 98 additions & 15 deletions backend/app/api/v1/arxiv.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# filepath: backend/app/api/v1/arxiv.py
import itertools
import math
import re
import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime, timezone
from typing import List
from urllib.parse import quote
Expand Down Expand Up @@ -47,6 +49,15 @@
"authority": 0.15,
}

# Relevance-scoring hyperparameters (BM25 term-saturation / length-normalization
# knobs plus the weights used to blend per-field scores into one number).
BM25_K1 = 1.4   # term-frequency saturation: higher -> repeated terms keep adding score
BM25_B = 0.65   # document-length normalization strength (0 = off, 1 = full)
TITLE_BM25_WEIGHT = 1.2    # title matches count slightly more than summary matches
SUMMARY_BM25_WEIGHT = 1.0
AUTHOR_BM25_WEIGHT = 0.6   # author-name matches are useful but weakest signal
PHRASE_MATCH_BONUS = 0.1   # flat bonus when the query appears as a contiguous phrase
RELEVANCE_COMBINATION = (0.65, 0.25, 0.10)  # bm25, token overlap, phrase bonus

# multi-batch candidate expansion tuning
DEFAULT_BATCH_SIZE = 25
MAX_BATCH_SIZE = 50
Expand Down Expand Up @@ -198,6 +209,35 @@ def tokenize(text: str) -> List[str]:
query_tokens = tokenize(query)
query_token_set = set(query_tokens)

# Pre-compute per-field tokenization and corpus-wide statistics in one pass,
# so the BM25-style relevance scoring below never re-tokenizes a paper.
tokenized_candidates: list[dict[str, list[str]]] = []
doc_freq = Counter()   # document frequency: number of candidates containing each token
combined_length = 0    # total tokens across all candidates, for the average doc length

for paper in candidates:
    title_tokens = tokenize(paper.get("title", ""))
    summary_tokens = tokenize(paper.get("summary", ""))
    author_tokens: List[str] = []
    for name in paper.get("authors", []):
        author_tokens.extend(tokenize(name))

    combined_tokens = list(itertools.chain(title_tokens, summary_tokens, author_tokens))
    combined_length += len(combined_tokens)
    # set() so each token counts at most once per document, as df requires
    for token in set(combined_tokens):
        doc_freq[token] += 1

    tokenized = {
        "title": title_tokens,
        "summary": summary_tokens,
        "authors": author_tokens,
        "combined": combined_tokens,
    }
    tokenized_candidates.append(tokenized)
    # cache on the paper dict itself; popped off again before the response is built
    paper["_tokens"] = tokenized

doc_count = len(tokenized_candidates)
# fall back to 1.0 to avoid division by zero when there are no candidates
avg_combined_len = combined_length / doc_count if doc_count else 1.0

def parse_date(date_str: str):
if not date_str:
return None
Expand All @@ -211,27 +251,69 @@ def overlap_score(target_tokens: List[str]) -> float:
return 0.0
return len(query_token_set & set(target_tokens)) / len(query_token_set)

def compute_bm25(field_tokens: List[str]) -> float:
    """Okapi BM25 score of one field against the enclosing query tokens.

    Uses the corpus statistics (doc_freq, doc_count, avg_combined_len)
    pre-computed over all candidates; returns 0.0 when either the query
    or the field is empty.
    """
    if not query_tokens or not field_tokens:
        return 0.0

    counts = Counter(field_tokens)
    # length normalization is identical for every term in this field, so hoist it
    length_norm = BM25_K1 * (1 - BM25_B + BM25_B * (len(field_tokens) / avg_combined_len))

    total = 0.0
    # iterate the raw query list: a term repeated in the query contributes repeatedly
    for term in query_tokens:
        df = doc_freq.get(term, 0)
        tf = counts.get(term, 0)
        if df == 0 or doc_count == 0 or tf == 0:
            continue

        # classic BM25 idf, shifted by +1 inside the log to stay non-negative
        idf = math.log(1 + (doc_count - df + 0.5) / (df + 0.5))
        total += idf * tf * (BM25_K1 + 1) / (tf + length_norm)

    return total

def phrase_bonus(paper: dict) -> float:
    """Return PHRASE_MATCH_BONUS when the whole query occurs as a contiguous
    phrase in the tokenized title or summary, else 0.0.

    Biases ranking toward papers that match the query as a focused phrase
    rather than as scattered terms.
    """
    # a single-token query carries no phrase information
    if len(query_tokens) < 2:
        return 0.0

    needle = " ".join(query_tokens)
    cached = paper.get("_tokens", {})
    for field in ("title", "summary"):
        haystack = " ".join(cached.get(field, []))
        if needle and needle in haystack:
            return PHRASE_MATCH_BONUS
    return 0.0

def compute_relevance(paper: dict) -> float:
    """Composite relevance of one paper for the current query, clamped to [0, 1].

    Blends three signals using RELEVANCE_COMBINATION weights:
      1. field-weighted BM25 (title > summary > authors), squashed via tanh
         because raw BM25 sums are unbounded;
      2. plain token-overlap coverage of the query against all fields;
      3. a flat contiguous-phrase bonus from phrase_bonus().

    NOTE(review): the pasted diff interleaved the deleted pre-BM25
    implementation (inline tokenize + overlap blend) with the added one;
    this is the added-side implementation, reconstructed cleanly.
    """
    if not query_token_set:
        return 0.0

    # tokenization was pre-computed once per paper and cached under "_tokens"
    tokens = paper.get("_tokens", {})
    title_tokens = tokens.get("title", [])
    summary_tokens = tokens.get("summary", [])
    author_tokens = tokens.get("authors", [])
    combined_tokens = tokens.get("combined", [])

    bm25_title = compute_bm25(title_tokens)
    bm25_summary = compute_bm25(summary_tokens)
    bm25_authors = compute_bm25(author_tokens)

    weighted_bm25 = (
        TITLE_BM25_WEIGHT * bm25_title
        + SUMMARY_BM25_WEIGHT * bm25_summary
        + AUTHOR_BM25_WEIGHT * bm25_authors
    ) / (TITLE_BM25_WEIGHT + SUMMARY_BM25_WEIGHT + AUTHOR_BM25_WEIGHT)

    # tanh maps the unbounded BM25 average into [0, 1)
    normalized_bm25 = math.tanh(weighted_bm25)
    coverage = overlap_score(combined_tokens)
    bonus = phrase_bonus(paper)

    bm25_w, overlap_w, bonus_w = RELEVANCE_COMBINATION
    composite = bm25_w * normalized_bm25 + overlap_w * coverage + bonus_w * bonus
    return min(1.0, composite)

def compute_recency(paper: dict) -> float:
date_str = paper.get("updated") or paper.get("published")
Expand Down Expand Up @@ -285,6 +367,7 @@ def score_paper(paper: dict) -> float:
selected = candidates[:max_results]
for paper in selected:
paper.pop("_score", None)
paper.pop("_tokens", None)

return {
"query": query,
Expand Down
Empty file modified scripts/dev.sh
100644 → 100755
Empty file.