From a967f04968886cb3fbbc16dd2b7bc86603b989c7 Mon Sep 17 00:00:00 2001
From: Luca Steeb
Date: Mon, 22 Sep 2025 20:30:03 +0000
Subject: [PATCH] refactor(chat): use extractTextFromMessageContent to normalize message content

- Introduced an extractTextFromMessageContent function to handle message content that can be a string or an array of message parts.
- Updated chat.ts, calculate-prompt-tokens.ts, and estimate-tokens.ts to use this function for consistent content extraction before token encoding.
- This improves compatibility with gpt-tokenizer, which expects string content.
- Added the extractTextFromMessageContent implementation in types.ts for reuse.

Co-authored-by: terragon-labs[bot]
---
 apps/gateway/src/chat/chat.ts                 | 13 ++++++------
 .../src/chat/tools/calculate-prompt-tokens.ts |  9 ++++++---
 .../gateway/src/chat/tools/estimate-tokens.ts | 11 +++++-----
 apps/gateway/src/chat/tools/types.ts          | 20 +++++++++++++++++++
 4 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts
index 7b1fbe0fc..55213b03f 100644
--- a/apps/gateway/src/chat/chat.ts
+++ b/apps/gateway/src/chat/chat.ts
@@ -54,7 +54,11 @@ import { getProviderEnv } from "./tools/get-provider-env.js";
 import { parseProviderResponse } from "./tools/parse-provider-response.js";
 import { transformResponseToOpenai } from "./tools/transform-response-to-openai.js";
 import { transformStreamingToOpenai } from "./tools/transform-streaming-to-openai.js";
-import { type ChatMessage, DEFAULT_TOKENIZER_MODEL } from "./tools/types.js";
+import {
+	type ChatMessage,
+	DEFAULT_TOKENIZER_MODEL,
+	extractTextFromMessageContent,
+} from "./tools/types.js";
 import { validateFreeModelUsage } from "./tools/validate-free-model-usage.js";
 
 import type { ServerTypes } from "@/vars.js";
@@ -716,10 +720,7 @@ chat.openapi(completions, async (c) => {
 	try {
 		const chatMessages: ChatMessage[] = messages.map((m) => ({
 			role: m.role as "user" | "assistant" | "system" | undefined,
-			content:
-				typeof m.content === "string"
-					? m.content
-					: JSON.stringify(m.content),
+			content: extractTextFromMessageContent(m.content),
 			name: m.name,
 		}));
 		requiredContextSize = encodeChat(
@@ -2530,7 +2531,7 @@ chat.openapi(completions, async (c) => {
 	// Convert messages to the format expected by gpt-tokenizer
 	const chatMessages: any[] = messages.map((m) => ({
 		role: m.role as "user" | "assistant" | "system" | undefined,
-		content: m.content || "",
+		content: extractTextFromMessageContent(m.content) || "",
 		name: m.name,
 	}));
 	calculatedPromptTokens = encodeChat(
diff --git a/apps/gateway/src/chat/tools/calculate-prompt-tokens.ts b/apps/gateway/src/chat/tools/calculate-prompt-tokens.ts
index 2a0241cf4..ab5ea0fd7 100644
--- a/apps/gateway/src/chat/tools/calculate-prompt-tokens.ts
+++ b/apps/gateway/src/chat/tools/calculate-prompt-tokens.ts
@@ -1,6 +1,10 @@
 import { encodeChat } from "gpt-tokenizer";
 
-import { type ChatMessage, DEFAULT_TOKENIZER_MODEL } from "./types.js";
+import {
+	type ChatMessage,
+	DEFAULT_TOKENIZER_MODEL,
+	extractTextFromMessageContent,
+} from "./types.js";
 
 /**
  * Transforms streaming chunk to OpenAI format for non-OpenAI providers
@@ -10,8 +14,7 @@ export function calculatePromptTokensFromMessages(messages: any[]): number {
 	try {
 		const chatMessages: ChatMessage[] = messages.map((m: any) => ({
 			role: m.role,
-			content:
-				typeof m.content === "string" ? m.content : JSON.stringify(m.content),
+			content: extractTextFromMessageContent(m.content),
 			name: m.name,
 		}));
 		return encodeChat(chatMessages, DEFAULT_TOKENIZER_MODEL).length;
diff --git a/apps/gateway/src/chat/tools/estimate-tokens.ts b/apps/gateway/src/chat/tools/estimate-tokens.ts
index c62343d54..8e5bfb4a7 100644
--- a/apps/gateway/src/chat/tools/estimate-tokens.ts
+++ b/apps/gateway/src/chat/tools/estimate-tokens.ts
@@ -2,7 +2,11 @@ import { encode, encodeChat } from "gpt-tokenizer";
 
 import { logger } from "@llmgateway/logger";
 
-import { type ChatMessage, DEFAULT_TOKENIZER_MODEL } from "./types.js";
+import {
+	type ChatMessage,
+	DEFAULT_TOKENIZER_MODEL,
+	extractTextFromMessageContent,
+} from "./types.js";
 
 import type { Provider } from "@llmgateway/models";
 
@@ -27,10 +31,7 @@ export function estimateTokens(
 		// Convert messages to the format expected by gpt-tokenizer
 		const chatMessages: ChatMessage[] = messages.map((m) => ({
 			role: m.role,
-			content:
-				typeof m.content === "string"
-					? m.content
-					: JSON.stringify(m.content),
+			content: extractTextFromMessageContent(m.content),
 			name: m.name,
 		}));
 		calculatedPromptTokens = encodeChat(
diff --git a/apps/gateway/src/chat/tools/types.ts b/apps/gateway/src/chat/tools/types.ts
index fc5631cb7..f9341a613 100644
--- a/apps/gateway/src/chat/tools/types.ts
+++ b/apps/gateway/src/chat/tools/types.ts
@@ -7,6 +7,26 @@ export interface ChatMessage {
 	name?: string;
 }
 
+/**
+ * Extracts text content from a message content field, handling both string and array formats
+ * This function is necessary because BaseMessage.content can be string | MessageContent[]
+ * but gpt-tokenizer expects only strings
+ */
+export function extractTextFromMessageContent(content: string | any[]): string {
+	if (typeof content === "string") {
+		return content;
+	}
+
+	if (Array.isArray(content)) {
+		return content
+			.filter((part: any) => part.type === "text")
+			.map((part: any) => part.text || "")
+			.join(" ");
+	}
+
+	return "";
+}
+
 // Define OpenAI-compatible image object type
 export interface ImageObject {
 	type: "image_url";
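
For review, a minimal standalone sketch of the behavior this patch relies on. The helper body is copied from the types.ts hunk above; encodeChat is the same gpt-tokenizer call the diff already uses; the literal "gpt-4" is only a stand-in for DEFAULT_TOKENIZER_MODEL, whose actual value is defined in types.ts and not shown in this patch.

import { encodeChat } from "gpt-tokenizer";

// Copied from the types.ts hunk above.
function extractTextFromMessageContent(content: string | any[]): string {
	if (typeof content === "string") {
		return content;
	}
	if (Array.isArray(content)) {
		return content
			.filter((part: any) => part.type === "text")
			.map((part: any) => part.text || "")
			.join(" ");
	}
	return "";
}

// Plain string content passes through unchanged.
extractTextFromMessageContent("What is in this image?");
// => "What is in this image?"

// Multi-part content keeps only the text parts, joined with spaces,
// rather than being JSON.stringify'd (the old path counted JSON syntax
// and image payloads as prompt tokens).
const text = extractTextFromMessageContent([
	{ type: "text", text: "What is in this image?" },
	{ type: "image_url", image_url: { url: "https://example.com/cat.png" } },
]);
// => "What is in this image?"

// The normalized string is safe to hand to gpt-tokenizer.
const promptTokens = encodeChat(
	[{ role: "user", content: text }],
	"gpt-4", // stand-in for DEFAULT_TOKENIZER_MODEL
).length;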