From cbb7160e94ece3d092eb93c4391601b08376ff25 Mon Sep 17 00:00:00 2001 From: VibhorGautam Date: Sun, 8 Mar 2026 22:27:24 +0530 Subject: [PATCH 1/2] fix: limit scraped content size to prevent excessive token usage Scraped web pages were being sent to the LLM in full, with no truncation. A single large page could produce 100K+ tokens of markdown, easily exceeding the model's context window. Use the existing splitText utility to cap scraped content at ~6000 tokens per page. Also add per-result and total character limits when assembling the final context for the writer prompt. Fixes #1031 --- src/lib/agents/search/api.ts | 27 ++++++++++++++----- src/lib/agents/search/index.ts | 27 ++++++++++++++----- .../search/researcher/actions/scrapeURL.ts | 9 ++++++- 3 files changed, 48 insertions(+), 15 deletions(-) diff --git a/src/lib/agents/search/api.ts b/src/lib/agents/search/api.ts index 924bc68f4..9f98d405d 100644 --- a/src/lib/agents/search/api.ts +++ b/src/lib/agents/search/api.ts @@ -49,13 +49,26 @@ class APISearchAgent { type: 'researchComplete', }); - const finalContext = - searchResults?.searchFindings - .map( - (f, index) => - `${f.content}`, - ) - .join('\n') || ''; + // Cap each result and total context to stay within reasonable token budgets + const maxCharsPerResult = 24000; + const maxTotalChars = 80000; + + let totalChars = 0; + const contextParts: string[] = []; + + if (searchResults?.searchFindings) { + for (let i = 0; i < searchResults.searchFindings.length; i++) { + const f = searchResults.searchFindings[i]; + const truncated = f.content.slice(0, maxCharsPerResult); + const part = `${truncated}`; + + if (totalChars + part.length > maxTotalChars) break; + totalChars += part.length; + contextParts.push(part); + } + } + + const finalContext = contextParts.join('\n'); const widgetContext = widgetOutputs .map((o) => { diff --git a/src/lib/agents/search/index.ts b/src/lib/agents/search/index.ts index 859183293..3711b2d27 100644 --- 
a/src/lib/agents/search/index.ts +++ b/src/lib/agents/search/index.ts @@ -98,13 +98,26 @@ class SearchAgent { type: 'researchComplete', }); - const finalContext = - searchResults?.searchFindings - .map( - (f, index) => - `${f.content}`, - ) - .join('\n') || ''; + // Cap each result and total context to stay within reasonable token budgets + const maxCharsPerResult = 24000; + const maxTotalChars = 80000; + + let totalChars = 0; + const contextParts: string[] = []; + + if (searchResults?.searchFindings) { + for (let i = 0; i < searchResults.searchFindings.length; i++) { + const f = searchResults.searchFindings[i]; + const truncated = f.content.slice(0, maxCharsPerResult); + const part = `${truncated}`; + + if (totalChars + part.length > maxTotalChars) break; + totalChars += part.length; + contextParts.push(part); + } + } + + const finalContext = contextParts.join('\n'); const widgetContext = widgetOutputs .map((o) => { diff --git a/src/lib/agents/search/researcher/actions/scrapeURL.ts b/src/lib/agents/search/researcher/actions/scrapeURL.ts index c702a7014..8bf2c79c8 100644 --- a/src/lib/agents/search/researcher/actions/scrapeURL.ts +++ b/src/lib/agents/search/researcher/actions/scrapeURL.ts @@ -3,6 +3,7 @@ import { ResearchAction } from '../../types'; import { Chunk, ReadingResearchBlock } from '@/lib/types'; import TurnDown from 'turndown'; import path from 'path'; +import { splitText } from '@/lib/utils/splitText'; const turndownService = new TurnDown(); @@ -110,8 +111,14 @@ const scrapeURLAction: ResearchAction = { const markdown = turndownService.turndown(text); + // Limit scraped content to avoid blowing up the context window. + // splitText chunks by token count — we only keep the first chunk. + const maxTokensPerPage = 6000; + const chunks = splitText(markdown, maxTokensPerPage, 0); + const content = chunks.length > 0 ? 
chunks[0] : markdown; + results.push({ - content: markdown, + content, metadata: { url, title: title, From 1c666479bc16fe29ea1aea782de6e14974638547 Mon Sep 17 00:00:00 2001 From: VibhorGautam Date: Sun, 8 Mar 2026 22:35:55 +0530 Subject: [PATCH 2/2] fix: cap raw HTML before markdown conversion Truncate the HTML to 200K chars before passing it to Turndown so we don't waste CPU converting huge pages we mostly discard after tokenization anyway. --- src/lib/agents/search/researcher/actions/scrapeURL.ts | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lib/agents/search/researcher/actions/scrapeURL.ts b/src/lib/agents/search/researcher/actions/scrapeURL.ts index 8bf2c79c8..c7f297d43 100644 --- a/src/lib/agents/search/researcher/actions/scrapeURL.ts +++ b/src/lib/agents/search/researcher/actions/scrapeURL.ts @@ -41,11 +41,18 @@ const scrapeURLAction: ResearchAction = { params.urls.map(async (url) => { try { const res = await fetch(url); - const text = await res.text(); + let text = await res.text(); const title = text.match(/<title>(.*?)<\/title>/i)?.[1] || `Content from ${url}`; + // Cap raw HTML before Turndown so we don't spend CPU converting + // megabytes of markup we'll mostly throw away after tokenization. + const maxHtmlChars = 200_000; + if (text.length > maxHtmlChars) { + text = text.slice(0, maxHtmlChars); + } + if ( !readingEmitted && researchBlock &&