superlinear-ai · jirastorza · Oct 15, 2025 · Oct 21, 2025 · Oct 21, 2025 · Oct 21, 2025
diff --git a/src/raglite/_config.py b/src/raglite/_config.py
@@ -80,3 +80,4 @@ class RAGLiteConfig:
     # Search config: you can pick any search method that returns (list[ChunkId], list[float]),
     # list[Chunk], or list[ChunkSpan].
     search_method: SearchMethod = field(default=_vector_search, compare=False)
+    _num_queries: int = 1
diff --git a/src/raglite/_rag.py b/src/raglite/_rag.py
@@ -1,7 +1,10 @@
 """Retrieval-augmented generation."""
 
 import json
+import logging
+import warnings
 from collections.abc import AsyncIterator, Callable, Iterator
+from dataclasses import replace
 from typing import Any
 
 import numpy as np
@@ -19,6 +22,8 @@
 from raglite._search import retrieve_chunk_spans
 from raglite._typing import MetadataFilter
 
+logger = logging.getLogger(__name__)
+
 # The default RAG instruction template follows Anthropic's best practices [1].
 # [1] https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/long-context-tips
 RAG_INSTRUCTION_TEMPLATE = """
@@ -58,9 +63,23 @@ def retrieve_context(
         chunk_spans = retrieve_chunk_spans(results, config=config)  # type: ignore[arg-type]
     elif all(isinstance(result, ChunkSpan) for result in results):
         chunk_spans = results  # type: ignore[assignment]
+    chunk_spans = limit_chunkspans(chunk_spans, config)
     return chunk_spans
 
 
+def limit_chunkspans(chunk_spans: list[ChunkSpan], config: RAGLiteConfig) -> list[ChunkSpan]:
+    """Truncate the chunk spans to this query's share of the model's context window.
+
+    The per-query token budget is the context size reported by `get_context_size(config)`
+    divided by `config._num_queries`, minus 300 tokens divided the same way (presumably headroom
+    for the rest of the prompt -- TODO confirm). Each chunk span's size is estimated as one token
+    per three characters of its JSON representation.
+
+    Returns the longest leading run of `chunk_spans` whose cumulative estimated size stays
+    strictly below the budget. A warning is logged whenever at least one chunk span is dropped.
+    """
+    # Guard against a non-positive query count, which would otherwise raise ZeroDivisionError.
+    num_queries = max(config._num_queries, 1)  # noqa: SLF001
+    max_tokens = get_context_size(config) // num_queries - 300 // num_queries
+    # Running total of the estimated token count after each chunk span.
+    cum_tokens = np.cumsum([len(chunk_span.to_json()) // 3 for chunk_span in chunk_spans])
+    # Index of the first chunk span whose running total reaches the budget, i.e. how many to keep.
+    first_chunk = int(np.searchsorted(cum_tokens, max_tokens))
+    if first_chunk < len(chunk_spans):
+        logger.warning(
+            "Retrieved chunks exceed context window. "
+            "Truncating to %d chunk(s). Consider reducing the number of retrieved chunks or using a model with bigger context window.",
+            # Report the number of chunk spans kept; dividing by the total always logged 0.
+            first_chunk,
+        )
+    return chunk_spans[:first_chunk]
+
+
 def add_context(
     user_prompt: str,
     context: list[ChunkSpan],
@@ -89,6 +108,15 @@ def _clip(messages: list[dict[str, str]], max_tokens: int) -> list[dict[str, str
     """Left clip a messages array to avoid hitting the context limit."""
     cum_tokens = np.cumsum([len(message.get("content") or "") // 3 for message in messages][::-1])
     first_message = -np.searchsorted(cum_tokens, max_tokens)
+    if first_message == 0 and cum_tokens[-1] > max_tokens:
+        warnings.warn(
+            (
+                f"Context window of {max_tokens} tokens exceeded even after clipping all previous messages."
+                "Consider using a model with a bigger context window or reducing the number of retrieved chunks."
+            ),
+            stacklevel=2,
+        )
+        return []
     return messages[first_message:]
 
 
@@ -147,6 +175,7 @@ def _run_tools(
     config: RAGLiteConfig,
 ) -> list[dict[str, Any]]:
     """Run tools to search the knowledge base for RAG context."""
+    config = replace(config, _num_queries=len(tool_calls))
     tool_messages: list[dict[str, Any]] = []
     for tool_call in tool_calls:
         if tool_call.function.name == "search_knowledge_base":