diff --git a/.changeset/add-zai-glm-4-7-cerebras-model.md b/.changeset/add-zai-glm-4-7-cerebras-model.md new file mode 100644 index 00000000000..141553f5f1d --- /dev/null +++ b/.changeset/add-zai-glm-4-7-cerebras-model.md @@ -0,0 +1,5 @@ +--- +"kilo-code": patch +--- + +Add `zai-glm-4.7` to Cerebras models diff --git a/.changeset/cmdv-image-paste-macos.md b/.changeset/cmdv-image-paste-macos.md deleted file mode 100644 index 778e74b47b4..00000000000 --- a/.changeset/cmdv-image-paste-macos.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"kilo-code": patch ---- - -Support Cmd+V for pasting images on macOS in VSCode terminal - -- Detect empty bracketed paste (when clipboard contains image instead of text) -- Trigger clipboard image check on empty paste or paste timeout -- Add Cmd+V (meta key) support alongside Ctrl+V for image paste diff --git a/.changeset/cute-flies-dance.md b/.changeset/cute-flies-dance.md new file mode 100644 index 00000000000..753c120f407 --- /dev/null +++ b/.changeset/cute-flies-dance.md @@ -0,0 +1,5 @@ +--- +"kilo-code": patch +--- + +Improved prompt caching when using Anthropic models on OpenRouter with native tool calling diff --git a/.changeset/enable-jetbrains-autocomplete.md b/.changeset/enable-jetbrains-autocomplete.md deleted file mode 100644 index 109f89977ea..00000000000 --- a/.changeset/enable-jetbrains-autocomplete.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"kilo-code": patch ---- - -Enable autocomplete by default in the JetBrains extension diff --git a/.changeset/fix-vscode-paste-truncation.md b/.changeset/fix-vscode-paste-truncation.md deleted file mode 100644 index 36a1c97f667..00000000000 --- a/.changeset/fix-vscode-paste-truncation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -"kilo-code": patch ---- - -Fix paste truncation in VSCode terminal - -- Prevent React StrictMode cleanup from interrupting paste operations -- Remove `completePaste()` and `clearBuffers()` from useEffect cleanup -- Paste buffer refs now persist across React re-mounts and flush properly when paste end marker is received diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fcf33f0562..638ca212ae5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,29 @@ # kilo-code +## 4.143.1 + +### Patch Changes + +- [#4832](https://github.com/Kilo-Org/kilocode/pull/4832) [`22a4ebf`](https://github.com/Kilo-Org/kilocode/commit/22a4ebfcd9f885b6ef9979dc6830226db9a4f397) Thanks [@Drilmo](https://github.com/Drilmo)! - Support Cmd+V for pasting images on macOS in VSCode terminal + + - Detect empty bracketed paste (when clipboard contains image instead of text) + - Trigger clipboard image check on empty paste or paste timeout + - Add Cmd+V (meta key) support alongside Ctrl+V for image paste + +- [#3856](https://github.com/Kilo-Org/kilocode/pull/3856) [`91e0a17`](https://github.com/Kilo-Org/kilocode/commit/91e0a1788963b8be50c58881f11ded96516ab163) Thanks [@markijbema](https://github.com/markijbema)! - Faster autocomplete when using the Mistral provider + +- [#4839](https://github.com/Kilo-Org/kilocode/pull/4839) [`abaada6`](https://github.com/Kilo-Org/kilocode/commit/abaada6b7ced6d3f4e37e69441e722e453289b81) Thanks [@markijbema](https://github.com/markijbema)! - Enable autocomplete by default in the JetBrains extension + +- [#4831](https://github.com/Kilo-Org/kilocode/pull/4831) [`a9cbb2c`](https://github.com/Kilo-Org/kilocode/commit/a9cbb2cebd75e0c675dc3b55e7a1653ccb93921b) Thanks [@Drilmo](https://github.com/Drilmo)! 
- Fix paste truncation in VSCode terminal + + - Prevent React StrictMode cleanup from interrupting paste operations + - Remove `completePaste()` and `clearBuffers()` from useEffect cleanup + - Paste buffer refs now persist across React re-mounts and flush properly when paste end marker is received + +- [#4847](https://github.com/Kilo-Org/kilocode/pull/4847) [`8ee812a`](https://github.com/Kilo-Org/kilocode/commit/8ee812a18da5da691bf76ee5c5d9d94cfb678f25) Thanks [@chrarnoldus](https://github.com/chrarnoldus)! - Disable structured outputs for Anthropic models, because the tool schema doesn't yet support it + +- [#4843](https://github.com/Kilo-Org/kilocode/pull/4843) [`0e3520a`](https://github.com/Kilo-Org/kilocode/commit/0e3520a0aa9a74f7a28af1f820558d2343fd4fba) Thanks [@markijbema](https://github.com/markijbema)! - Filter unhelpful suggestions in chat autocomplete + ## 4.143.0 ### Minor Changes diff --git a/apps/kilocode-docs/docs/providers/cerebras.md b/apps/kilocode-docs/docs/providers/cerebras.md index 5a92ce88246..14ab0289b2f 100644 --- a/apps/kilocode-docs/docs/providers/cerebras.md +++ b/apps/kilocode-docs/docs/providers/cerebras.md @@ -20,7 +20,8 @@ Cerebras is known for their ultra-fast AI inference powered by the Cerebras CS-3 Kilo Code supports the following Cerebras models: - `gpt-oss-120b` (Default) – High-performance open-source model optimized for fast inference -- `zai-glm-4.6` – Advanced GLM model with enhanced reasoning capabilities +- `zai-glm-4.6` – Fast general-purpose model on Cerebras (up to 1,000 tokens/s). To be deprecated soon. +- `zai-glm-4.7` – Highly capable general-purpose model on Cerebras (up to 1,000 tokens/s), competitive with leading proprietary models on coding tasks. Refer to the [Cerebras documentation](https://docs.cerebras.ai/) for detailed information on model capabilities and performance characteristics. diff --git a/packages/types/src/providers/cerebras.ts b/packages/types/src/providers/cerebras.ts index 1f28c00bdfd..c5f770d4b2a 100644 --- a/packages/types/src/providers/cerebras.ts +++ b/packages/types/src/providers/cerebras.ts @@ -14,7 +14,18 @@ export const cerebrasModels = { supportsNativeTools: true, inputPrice: 0, outputPrice: 0, - description: "Highly intelligent general purpose model with up to 1,000 tokens/s", + description: "Fast general-purpose model on Cerebras (up to 1,000 tokens/s). 
To be deprecated soon.", + }, + "zai-glm-4.7": { + maxTokens: 16384, // Conservative default to avoid premature rate limiting (Cerebras reserves quota upfront) + contextWindow: 131072, + supportsImages: false, + supportsPromptCache: false, + supportsNativeTools: true, + inputPrice: 0, + outputPrice: 0, + description: + "Highly capable general-purpose model on Cerebras (up to 1,000 tokens/s), competitive with leading proprietary models on coding tasks.", }, "qwen-3-235b-a22b-instruct-2507": { maxTokens: 16384, // Conservative default to avoid premature rate limiting diff --git a/src/api/providers/__tests__/kilocode-openrouter.spec.ts b/src/api/providers/__tests__/kilocode-openrouter.spec.ts index 016eac8fb40..c1138019118 100644 --- a/src/api/providers/__tests__/kilocode-openrouter.spec.ts +++ b/src/api/providers/__tests__/kilocode-openrouter.spec.ts @@ -259,26 +259,6 @@ describe("KilocodeOpenrouterHandler", () => { expect(handler.supportsFim()).toBe(false) }) - it("completeFim handles errors correctly", async () => { - const handler = new KilocodeOpenrouterHandler({ - ...mockOptions, - kilocodeModel: "mistral/codestral-latest", - }) - - const mockResponse = { - ok: false, - status: 500, - statusText: "Internal Server Error", - text: vitest.fn().mockResolvedValue("Error details"), - } - - global.fetch = vitest.fn().mockResolvedValue(mockResponse) - - await expect(handler.completeFim("prefix", "suffix")).rejects.toThrow( - "FIM streaming failed: 500 Internal Server Error - Error details", - ) - }) - it("streamFim yields chunks correctly", async () => { const handler = new KilocodeOpenrouterHandler({ ...mockOptions, diff --git a/src/api/providers/__tests__/mistral-fim.spec.ts b/src/api/providers/__tests__/mistral-fim.spec.ts new file mode 100644 index 00000000000..b9ebade8fac --- /dev/null +++ b/src/api/providers/__tests__/mistral-fim.spec.ts @@ -0,0 +1,180 @@ +// kilocode_change - new file +// npx vitest run src/api/providers/__tests__/mistral-fim.spec.ts + +// Mock vscode first to avoid import errors +vitest.mock("vscode", () => ({})) + +import { MistralHandler } from "../mistral" +import { ApiHandlerOptions } from "../../../shared/api" +import { streamSse } from "../../../services/continuedev/core/fetch/stream" + +// Mock the stream module +vitest.mock("../../../services/continuedev/core/fetch/stream", () => ({ + streamSse: vitest.fn(), +})) + +// Mock delay +vitest.mock("delay", () => ({ default: vitest.fn(() => Promise.resolve()) })) + +describe("MistralHandler FIM support", () => { + const mockOptions: ApiHandlerOptions = { + mistralApiKey: "test-api-key", + apiModelId: "codestral-latest", + } + + beforeEach(() => vitest.clearAllMocks()) + + describe("supportsFim", () => { + it("returns true for codestral models", () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-latest", + }) + + expect(handler.supportsFim()).toBe(true) + }) + + it("returns true for codestral-2405", () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-2405", + }) + + expect(handler.supportsFim()).toBe(true) + }) + + it("returns false for non-codestral models", () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "mistral-large-latest", + }) + + expect(handler.supportsFim()).toBe(false) + }) + + it("returns true when no model is specified (defaults to codestral-latest)", () => { + const handler = new MistralHandler({ + mistralApiKey: "test-api-key", + }) + + // Default model is codestral-latest, which supports FIM + 
expect(handler.supportsFim()).toBe(true) + }) + }) + + describe("streamFim", () => { + it("yields chunks correctly", async () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-latest", + }) + + // Mock streamSse to return the expected data + ;(streamSse as any).mockImplementation(async function* () { + yield { choices: [{ delta: { content: "chunk1" } }] } + yield { choices: [{ delta: { content: "chunk2" } }] } + yield { choices: [{ delta: { content: "chunk3" } }] } + }) + + const mockResponse = { + ok: true, + status: 200, + statusText: "OK", + } as Response + + global.fetch = vitest.fn().mockResolvedValue(mockResponse) + + const chunks: string[] = [] + + for await (const chunk of handler.streamFim("prefix", "suffix")) { + chunks.push(chunk) + } + + expect(chunks).toEqual(["chunk1", "chunk2", "chunk3"]) + expect(streamSse).toHaveBeenCalledWith(mockResponse) + }) + + it("handles errors correctly", async () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-latest", + }) + + const mockResponse = { + ok: false, + status: 400, + statusText: "Bad Request", + text: vitest.fn().mockResolvedValue("Invalid request"), + } + + global.fetch = vitest.fn().mockResolvedValue(mockResponse) + + const generator = handler.streamFim("prefix", "suffix") + await expect(generator.next()).rejects.toThrow("FIM streaming failed: 400 Bad Request - Invalid request") + }) + + it("uses correct endpoint for codestral models", async () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-latest", + }) + + ;(streamSse as any).mockImplementation(async function* () { + yield { choices: [{ delta: { content: "test" } }] } + }) + + const mockResponse = { + ok: true, + status: 200, + statusText: "OK", + } as Response + + global.fetch = vitest.fn().mockResolvedValue(mockResponse) + + const generator = handler.streamFim("prefix", "suffix") + await generator.next() + + expect(global.fetch).toHaveBeenCalledWith( + expect.objectContaining({ + href: "https://codestral.mistral.ai/v1/fim/completions", + }), + expect.objectContaining({ + method: "POST", + headers: expect.objectContaining({ + Authorization: "Bearer test-api-key", + }), + }), + ) + }) + + it("uses custom codestral URL when provided", async () => { + const handler = new MistralHandler({ + ...mockOptions, + apiModelId: "codestral-latest", + mistralCodestralUrl: "https://custom.codestral.url", + }) + + ;(streamSse as any).mockImplementation(async function* () { + yield { choices: [{ delta: { content: "test" } }] } + }) + + const mockResponse = { + ok: true, + status: 200, + statusText: "OK", + } as Response + + global.fetch = vitest.fn().mockResolvedValue(mockResponse) + + const generator = handler.streamFim("prefix", "suffix") + await generator.next() + + expect(global.fetch).toHaveBeenCalledWith( + expect.objectContaining({ + href: "https://custom.codestral.url/v1/fim/completions", + }), + expect.any(Object), + ) + }) + }) +}) diff --git a/src/api/providers/__tests__/openrouter.spec.ts b/src/api/providers/__tests__/openrouter.spec.ts index 2dc6a3dc1cb..ee34fd4f52f 100644 --- a/src/api/providers/__tests__/openrouter.spec.ts +++ b/src/api/providers/__tests__/openrouter.spec.ts @@ -71,10 +71,6 @@ describe("OpenRouterHandler", () => { openRouterModelId: "anthropic/claude-sonnet-4", } - // kilocode_change start - const anthropicBetaHeaderValue = "fine-grained-tool-streaming-2025-05-14,structured-outputs-2025-11-13" - // kilocode_change end - beforeEach(() => 
vitest.clearAllMocks()) it("initializes with correct options", () => { @@ -208,13 +204,7 @@ describe("OpenRouterHandler", () => { top_p: undefined, transforms: ["middle-out"], }), - // kilocode_change start - expect.objectContaining({ - headers: expect.objectContaining({ - "x-anthropic-beta": anthropicBetaHeaderValue, - }), - }), - // kilocode_change end + { headers: { "x-anthropic-beta": "fine-grained-tool-streaming-2025-05-14" } }, ) }) @@ -239,16 +229,9 @@ describe("OpenRouterHandler", () => { await handler.createMessage("test", []).next() - expect(mockCreate).toHaveBeenCalledWith( - expect.objectContaining({ transforms: ["middle-out"] }), - // kilocode_change start - expect.objectContaining({ - headers: expect.objectContaining({ - "x-anthropic-beta": anthropicBetaHeaderValue, - }), - }), - // kilocode_change end - ) + expect(mockCreate).toHaveBeenCalledWith(expect.objectContaining({ transforms: ["middle-out"] }), { + headers: { "x-anthropic-beta": "fine-grained-tool-streaming-2025-05-14" }, + }) }) it("adds cache control for supported models", async () => { @@ -290,13 +273,7 @@ describe("OpenRouterHandler", () => { }), ]), }), - // kilocode_change start - expect.objectContaining({ - headers: expect.objectContaining({ - "x-anthropic-beta": anthropicBetaHeaderValue, - }), - }), - // kilocode_change end + { headers: { "x-anthropic-beta": "fine-grained-tool-streaming-2025-05-14" } }, ) }) @@ -537,13 +514,7 @@ describe("OpenRouterHandler", () => { messages: [{ role: "user", content: "test prompt" }], stream: false, }, - // kilocode_change start - expect.objectContaining({ - headers: expect.objectContaining({ - "x-anthropic-beta": anthropicBetaHeaderValue, - }), - }), - // kilocode_change end + { headers: { "x-anthropic-beta": "fine-grained-tool-streaming-2025-05-14" } }, ) }) diff --git a/src/api/providers/kilocode-openrouter.ts b/src/api/providers/kilocode-openrouter.ts index 6bcb55ffa27..a09e00e03ba 100644 --- a/src/api/providers/kilocode-openrouter.ts +++ b/src/api/providers/kilocode-openrouter.ts @@ -148,14 +148,6 @@ export class KilocodeOpenrouterHandler extends OpenRouterHandler { return modelId.includes("codestral") } - async completeFim(prefix: string, suffix: string, taskId?: string): Promise { - let result = "" - for await (const chunk of this.streamFim(prefix, suffix, taskId)) { - result += chunk - } - return result - } - async *streamFim( prefix: string, suffix: string, diff --git a/src/api/providers/kilocode/IFimProvider.ts b/src/api/providers/kilocode/IFimProvider.ts index 487301dc86b..dc964b79c48 100644 --- a/src/api/providers/kilocode/IFimProvider.ts +++ b/src/api/providers/kilocode/IFimProvider.ts @@ -13,15 +13,6 @@ export interface IFimProvider { */ supportsFim(): boolean - /** - * Complete code between a prefix and suffix (non-streaming) - * @param prefix - The code before the cursor/insertion point - * @param suffix - The code after the cursor/insertion point - * @param taskId - Optional task ID for tracking - * @returns The completed code string - */ - completeFim(prefix: string, suffix: string, taskId?: string): Promise - /** * Stream code completion between a prefix and suffix * @param prefix - The code before the cursor/insertion point diff --git a/src/api/providers/mistral.ts b/src/api/providers/mistral.ts index 96d2c332552..6ef99acd93f 100644 --- a/src/api/providers/mistral.ts +++ b/src/api/providers/mistral.ts @@ -11,6 +11,9 @@ import { ApiStream } from "../transform/stream" import { BaseProvider } from "./base-provider" import type { SingleCompletionHandler, 
ApiHandlerCreateMessageMetadata } from "../index" +import { DEFAULT_HEADERS } from "./constants" // kilocode_change +import { streamSse } from "../../services/continuedev/core/fetch/stream" // kilocode_change +import type { CompletionUsage } from "./openrouter" // kilocode_change // Type helper to handle thinking chunks from Mistral API // The SDK includes ThinkChunk but TypeScript has trouble with the discriminated union @@ -209,4 +212,74 @@ export class MistralHandler extends BaseProvider implements SingleCompletionHand throw error } } + + // kilocode_change start + supportsFim(): boolean { + const modelId = this.options.apiModelId ?? mistralDefaultModelId + return modelId.startsWith("codestral-") + } + + async *streamFim( + prefix: string, + suffix: string, + _taskId?: string, + onUsage?: (usage: CompletionUsage) => void, + ): AsyncGenerator<string> { + const { id: model, maxTokens } = this.getModel() + + // Get the base URL for the model + // copy pasted from constructor, be sure to keep in sync + const baseUrl = model.startsWith("codestral-") + ? this.options.mistralCodestralUrl || "https://codestral.mistral.ai" + : "https://api.mistral.ai" + + const endpoint = new URL("v1/fim/completions", baseUrl) + + const headers: Record<string, string> = { + ...DEFAULT_HEADERS, + "Content-Type": "application/json", + Accept: "application/json", + Authorization: `Bearer ${this.options.mistralApiKey}`, + } + + // temperature: 0.2 is mentioned as a sane example in mistral's docs + const temperature = 0.2 + const requestMaxTokens = 256 + + const response = await fetch(endpoint, { + method: "POST", + body: JSON.stringify({ + model, + prompt: prefix, + suffix, + max_tokens: Math.min(requestMaxTokens, maxTokens ?? requestMaxTokens), + temperature, + stream: true, + }), + headers, + }) + + if (!response.ok) { + const errorText = await response.text() + throw new Error(`FIM streaming failed: ${response.status} ${response.statusText} - ${errorText}`) + } + + for await (const data of streamSse(response)) { + const content = data.choices?.[0]?.delta?.content + if (content) { + yield content + } + + // Call usage callback when available + // Note: Mistral FIM API returns usage in the final chunk with prompt_tokens and completion_tokens + if (data.usage && onUsage) { + onUsage({ + prompt_tokens: data.usage.prompt_tokens, + completion_tokens: data.usage.completion_tokens, + total_tokens: data.usage.total_tokens, + }) + } + } + } + // kilocode_change end } diff --git a/src/api/providers/openrouter.ts b/src/api/providers/openrouter.ts index 97a7fef3ac4..5fb60bde6df 100644 --- a/src/api/providers/openrouter.ts +++ b/src/api/providers/openrouter.ts @@ -20,7 +20,7 @@ import { resolveToolProtocol } from "../../utils/resolveToolProtocol" import { TOOL_PROTOCOL } from "@roo-code/types" import { ApiStreamChunk } from "../transform/stream" import { convertToR1Format } from "../transform/r1-format" -import { addCacheBreakpoints as addAnthropicCacheBreakpoints } from "../transform/caching/anthropic" +import { addAnthropicCacheBreakpoints } from "../transform/caching/kilocode" // kilocode_change: own implementation that supports tool results import { addCacheBreakpoints as addGeminiCacheBreakpoints } from "../transform/caching/gemini" import type { OpenRouterReasoningParams } from "../transform/reasoning" import { getModelParams } from "../transform/model-params" @@ -315,8 +315,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH // kilocode_change start const requestOptions = this.customRequestOptions(metadata) ?? 
{ headers: {} } if (modelId.startsWith("anthropic/")) { - requestOptions.headers["x-anthropic-beta"] = - "fine-grained-tool-streaming-2025-05-14,structured-outputs-2025-11-13" + requestOptions.headers["x-anthropic-beta"] = "fine-grained-tool-streaming-2025-05-14" } // kilocode_change end @@ -566,8 +565,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH // kilocode_change start const requestOptions = this.customRequestOptions() ?? { headers: {} } if (modelId.startsWith("anthropic/")) { - requestOptions.headers["x-anthropic-beta"] = - "fine-grained-tool-streaming-2025-05-14,structured-outputs-2025-11-13" + requestOptions.headers["x-anthropic-beta"] = "fine-grained-tool-streaming-2025-05-14" } // kilocode_change end diff --git a/src/api/transform/caching/__tests__/kilocode.spec.ts b/src/api/transform/caching/__tests__/kilocode.spec.ts new file mode 100644 index 00000000000..e4d1ddbde62 --- /dev/null +++ b/src/api/transform/caching/__tests__/kilocode.spec.ts @@ -0,0 +1,245 @@ +// npx vitest run src/api/transform/caching/__tests__/kilocode.spec.ts + +import OpenAI from "openai" + +import { addAnthropicCacheBreakpoints } from "../kilocode" + +describe("addAnthropicCacheBreakpoints (Kilocode)", () => { + const systemPrompt = "You are a helpful assistant." + + it("should add a cache breakpoint to the system prompt", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "Hello" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should add a breakpoint to the only user message if only one exists", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "User message 1" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // System prompt gets cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // Last user message gets cache control + expect(messages[1].content).toEqual([ + { type: "text", text: "User message 1", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should add breakpoints to system, last user, and user before last assistant", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "User message 1" }, + { role: "assistant", content: "Assistant response 1" }, + { role: "user", content: "User message 2" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // System prompt gets cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // User message before last assistant gets cache control + expect(messages[1].content).toEqual([ + { type: "text", text: "User message 1", cache_control: { type: "ephemeral" } }, + ]) + + // Assistant message should not be modified + expect(messages[2].content).toBe("Assistant response 1") + + // Last user message gets cache control + expect(messages[3].content).toEqual([ + { type: "text", text: "User message 2", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should handle multiple assistant messages and find the user before the last one", () => { + const messages: 
OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "User message 1" }, + { role: "assistant", content: "Assistant response 1" }, + { role: "user", content: "User message 2" }, + { role: "assistant", content: "Assistant response 2" }, + { role: "user", content: "User message 3" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // System prompt gets cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // First user message should NOT get cache control (not before last assistant) + expect(messages[1].content).toBe("User message 1") + + // User message before last assistant (index 4) gets cache control + expect(messages[3].content).toEqual([ + { type: "text", text: "User message 2", cache_control: { type: "ephemeral" } }, + ]) + + // Last user message gets cache control + expect(messages[5].content).toEqual([ + { type: "text", text: "User message 3", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should handle tool messages the same as user messages", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "User message 1" }, + { role: "assistant", content: "Let me use a tool" }, + { role: "tool", content: "Tool result", tool_call_id: "call_123" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // System prompt gets cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // User message before last assistant gets cache control + expect(messages[1].content).toEqual([ + { type: "text", text: "User message 1", cache_control: { type: "ephemeral" } }, + ]) + + // Tool message (last user/tool) gets cache control + expect(messages[3].content).toEqual([ + { type: "text", text: "Tool result", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should handle array content and add cache control to last item", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { + role: "user", + content: [ + { type: "text", text: "First part" }, + { type: "image_url", image_url: { url: "data:image/png;base64,..." } }, + { type: "text", text: "Last part" }, + ], + }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + expect(messages[1].content).toEqual([ + { type: "text", text: "First part" }, + { type: "image_url", image_url: { url: "data:image/png;base64,..." } }, + { type: "text", text: "Last part", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should add cache control to last item of array when it's an image", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { + role: "user", + content: [ + { type: "text", text: "Some text" }, + { type: "image_url", image_url: { url: "data:image/png;base64,..." } }, + ], + }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // Cache control should be on the last item (the image) + expect(messages[1].content).toEqual([ + { type: "text", text: "Some text" }, + { + type: "image_url", + image_url: { url: "data:image/png;base64,..." 
}, + cache_control: { type: "ephemeral" }, + }, + ]) + }) + + it("should not add breakpoints when there are no user or tool messages", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "assistant", content: "Hello" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // Only system prompt should get cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // Assistant message should not be modified + expect(messages[1].content).toBe("Hello") + }) + + it("should handle case when system prompt is found in messages array", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: "Different system prompt in array" }, + { role: "user", content: "Hello" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // Should use the system prompt found in messages, not the passed parameter + expect(messages[0].content).toEqual([ + { type: "text", text: "Different system prompt in array", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should handle when last user message is also user before last assistant (same message)", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: "User message 1" }, + { role: "assistant", content: "Assistant response" }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // System prompt gets cache control + expect(messages[0].content).toEqual([ + { type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }, + ]) + + // User message 1 is both before last assistant and is the last user message + // It should have cache control set (the function calls setCacheControl twice on same message) + expect(messages[1].content).toEqual([ + { type: "text", text: "User message 1", cache_control: { type: "ephemeral" } }, + ]) + }) + + it("should handle empty messages array gracefully", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [] + + // Should not throw + expect(() => addAnthropicCacheBreakpoints(systemPrompt, messages)).not.toThrow() + }) + + it("should handle empty array content", () => { + const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [ + { role: "system", content: systemPrompt }, + { role: "user", content: [] }, + ] + + addAnthropicCacheBreakpoints(systemPrompt, messages) + + // Empty array should remain empty (no last item to add cache control to) + expect(messages[1].content).toEqual([]) + }) +}) diff --git a/src/api/transform/caching/kilocode.ts b/src/api/transform/caching/kilocode.ts new file mode 100644 index 00000000000..b955ef620a8 --- /dev/null +++ b/src/api/transform/caching/kilocode.ts @@ -0,0 +1,47 @@ +import OpenAI from "openai" +import { findLast, findLastIndex } from "../../../shared/array" + +function setCacheControl(message: OpenAI.ChatCompletionMessageParam) { + if (typeof message.content === "string") { + message.content = [ + { + type: "text", + text: message.content, + // @ts-ignore-next-line + cache_control: { type: "ephemeral" }, + }, + ] + } else if (Array.isArray(message.content)) { + const lastItem = message.content.at(-1) + if (lastItem) { + // @ts-ignore-next-line + lastItem.cache_control = { type: "ephemeral" } + } + } +} + +export function addAnthropicCacheBreakpoints( + _systemPrompt: string, + messages: OpenAI.Chat.ChatCompletionMessageParam[], +) { + const 
systemPrompt = messages.find((msg) => msg.role === "system") + if (systemPrompt) { + setCacheControl(systemPrompt) + } + + const lastUserMessage = findLast(messages, (msg) => msg.role === "user" || msg.role === "tool") + if (lastUserMessage) { + setCacheControl(lastUserMessage) + } + + const lastAssistantIndex = findLastIndex(messages, (msg) => msg.role === "assistant") + if (lastAssistantIndex >= 0) { + const previousUserMessage = findLast( + messages.slice(0, lastAssistantIndex), + (msg) => msg.role === "user" || msg.role === "tool", + ) + if (previousUserMessage) { + setCacheControl(previousUserMessage) + } + } +} diff --git a/src/package.json b/src/package.json index 28d26e7f29b..9c03f733942 100644 --- a/src/package.json +++ b/src/package.json @@ -3,7 +3,7 @@ "displayName": "%extension.displayName%", "description": "%extension.description%", "publisher": "kilocode", - "version": "4.143.0", + "version": "4.143.1", "icon": "assets/icons/logo-outline-black.png", "galleryBanner": { "color": "#FFFFFF", diff --git a/src/services/ghost/GhostModel.ts b/src/services/ghost/GhostModel.ts index 50019360dd2..3e9a74ffea4 100644 --- a/src/services/ghost/GhostModel.ts +++ b/src/services/ghost/GhostModel.ts @@ -1,3 +1,4 @@ +// kilocode_change new file import { modelIdKeysByProvider, ProviderName } from "@roo-code/types" import { ApiHandler, buildApiHandler } from "../../api" import { ProviderSettingsManager } from "../../core/config/ProviderSettingsManager" @@ -9,6 +10,33 @@ import { KilocodeOpenrouterHandler } from "../../api/providers/kilocode-openrout import { PROVIDERS } from "../../../webview-ui/src/components/settings/constants" import { ResponseMetaData } from "./types" +/** + * Interface for handlers that support FIM (Fill-in-the-Middle) completions. + * Uses duck typing - any handler implementing these methods can be used for FIM. + */ +interface FimCapableHandler { + supportsFim(): boolean + streamFim( + prefix: string, + suffix: string, + taskId?: string, + onUsage?: (usage: CompletionUsage) => void, + ): AsyncGenerator + getModel(): { id: string; info: any; maxTokens?: number } + getTotalCost?(usage: CompletionUsage): number +} + +/** + * Type guard to check if a handler supports FIM operations using duck typing. + */ +function isFimCapable(handler: ApiHandler): handler is ApiHandler & FimCapableHandler { + return ( + typeof (handler as any).supportsFim === "function" && + typeof (handler as any).streamFim === "function" && + (handler as any).supportsFim() === true + ) +} + // Convert PROVIDERS array to a lookup map for display names const PROVIDER_DISPLAY_NAMES = Object.fromEntries(PROVIDERS.map(({ value, label }) => [value, label])) as Record< ProviderName, @@ -92,15 +120,13 @@ export class GhostModel { return false } - if (this.apiHandler instanceof KilocodeOpenrouterHandler) { - return this.apiHandler.supportsFim() - } - - return false + // Use duck typing to check if the handler supports FIM + return isFimCapable(this.apiHandler) } /** - * Generate FIM completion using the FIM API endpoint + * Generate FIM completion using the FIM API endpoint. + * Uses duck typing to support any handler that implements supportsFim() and streamFim(). */ public async generateFimResponse( prefix: string, @@ -113,12 +139,8 @@ export class GhostModel { throw new Error("API handler is not initialized. 
Please check your configuration.") } - if (!(this.apiHandler instanceof KilocodeOpenrouterHandler)) { - throw new Error("FIM is only supported for KiloCode provider") - } - - if (!this.apiHandler.supportsFim()) { - throw new Error("Current model does not support FIM completions") + if (!isFimCapable(this.apiHandler)) { + throw new Error("Current provider/model does not support FIM completions") } console.log("USED MODEL (FIM)", this.apiHandler.getModel()) @@ -131,7 +153,9 @@ export class GhostModel { onChunk(chunk) } - const cost = usage ? this.apiHandler.getTotalCost(usage) : 0 + // Calculate cost if the handler supports it (duck typing) + const cost = + usage && typeof this.apiHandler.getTotalCost === "function" ? this.apiHandler.getTotalCost(usage) : 0 const inputTokens = usage?.prompt_tokens ?? 0 const outputTokens = usage?.completion_tokens ?? 0 const cacheReadTokens = usage?.prompt_tokens_details?.cached_tokens ?? 0 diff --git a/src/services/ghost/chat-autocomplete/ChatTextAreaAutocomplete.ts b/src/services/ghost/chat-autocomplete/ChatTextAreaAutocomplete.ts index 334c426392f..4e961d959b8 100644 --- a/src/services/ghost/chat-autocomplete/ChatTextAreaAutocomplete.ts +++ b/src/services/ghost/chat-autocomplete/ChatTextAreaAutocomplete.ts @@ -2,13 +2,10 @@ import * as vscode from "vscode" import { GhostModel } from "../GhostModel" import { ProviderSettingsManager } from "../../../core/config/ProviderSettingsManager" import { AutocompleteContext, VisibleCodeContext } from "../types" -import { ApiStreamChunk } from "../../../api/transform/stream" import { removePrefixOverlap } from "../../continuedev/core/autocomplete/postprocessing/removePrefixOverlap.js" import { AutocompleteTelemetry } from "../classic-auto-complete/AutocompleteTelemetry" +import { postprocessGhostSuggestion } from "../classic-auto-complete/uselessSuggestionFilter" -/** - * Service for providing FIM-based autocomplete suggestions in ChatTextArea - */ export class ChatTextAreaAutocomplete { private model: GhostModel private providerSettingsManager: ProviderSettingsManager @@ -24,14 +21,6 @@ export class ChatTextAreaAutocomplete { return this.model.reload(this.providerSettingsManager) } - /** - * Check if we can successfully make a FIM request. - * Validates that model is loaded, has valid API handler, and supports FIM. - */ - isFimAvailable(): boolean { - return this.model.hasValidCredentials() && this.model.supportsFim() - } - async getCompletion(userText: string, visibleCodeContext?: VisibleCodeContext): Promise<{ suggestion: string }> { const startTime = Date.now() @@ -147,9 +136,6 @@ TASK: Complete the user's message naturally. - Return ONLY the completion text (what comes next), no explanations.` } - /** - * Build the prefix for FIM completion with visible code context and additional sources - */ private async buildPrefix(userText: string, visibleCodeContext?: VisibleCodeContext): Promise { const contextParts: string[] = [] @@ -179,9 +165,6 @@ TASK: Complete the user's message naturally. return contextParts.join("\n") } - /** - * Get clipboard content for context - */ private async getClipboardContext(): Promise { try { const text = await vscode.env.clipboard.readText() @@ -195,51 +178,30 @@ TASK: Complete the user's message naturally. 
return null } - /** - * Clean the suggestion by removing any leading repetition of user text - * and filtering out unwanted patterns like comments - */ - private cleanSuggestion(suggestion: string, userText: string): string { - let cleaned = suggestion - - cleaned = removePrefixOverlap(cleaned, userText) - - const firstNewline = cleaned.indexOf("\n") - if (firstNewline !== -1) { - cleaned = cleaned.substring(0, firstNewline) - } - cleaned = cleaned.trimEnd() // Do NOT trim the end of the suggestion + public cleanSuggestion(suggestion: string, userText: string): string { + let cleaned = postprocessGhostSuggestion({ + suggestion: removePrefixOverlap(suggestion, userText), + prefix: userText, + suffix: "", // Chat textarea has no suffix + model: this.model.getModelName() ?? "unknown", + }) - // Filter out suggestions that start with comment patterns - // This happens because the context uses // prefixes for labels - if (this.isUnwantedSuggestion(cleaned)) { + if (cleaned === undefined) { return "" } - return cleaned - } - - /** - * Check if suggestion should be filtered out - */ - public isUnwantedSuggestion(suggestion: string): boolean { - // Filter comment-starting suggestions - if (suggestion.startsWith("//") || suggestion.startsWith("/*") || suggestion.startsWith("*")) { - return true - } - // Filter suggestions that look like code rather than natural language - // This includes preprocessor directives (#include) and markdown headers - // Chat is for natural language, not formatted documents - if (suggestion.startsWith("#")) { - return true + if (cleaned.match(/^(\/\/|\/\*|\*|#)/)) { + return "" } - // Filter suggestions that are just punctuation or whitespace - if (suggestion.length < 2 || /^[\s\p{P}]+$/u.test(suggestion)) { - return true + // Chat-specific: truncate at first newline for single-line suggestions + const firstNewline = cleaned.indexOf("\n") + if (firstNewline !== -1) { + cleaned = cleaned.substring(0, firstNewline) } + cleaned = cleaned.trimEnd() - return false + return cleaned } } diff --git a/src/services/ghost/chat-autocomplete/__tests__/ChatTextAreaAutocomplete.spec.ts b/src/services/ghost/chat-autocomplete/__tests__/ChatTextAreaAutocomplete.spec.ts index 26d1a26dff4..0d6bd41d254 100644 --- a/src/services/ghost/chat-autocomplete/__tests__/ChatTextAreaAutocomplete.spec.ts +++ b/src/services/ghost/chat-autocomplete/__tests__/ChatTextAreaAutocomplete.spec.ts @@ -45,46 +45,41 @@ describe("ChatTextAreaAutocomplete", () => { }) }) - describe("isFimAvailable", () => { - it("should return false when model is not loaded", () => { - const result = autocomplete.isFimAvailable() - expect(result).toBe(false) - }) - }) - - describe("isUnwantedSuggestion", () => { - it("should filter code patterns (comments, preprocessor, short/empty)", () => { - const filter = autocomplete.isUnwantedSuggestion.bind(autocomplete) - - // Comments - expect(filter("// comment")).toBe(true) - expect(filter("/* comment")).toBe(true) - expect(filter("*")).toBe(true) + describe("cleanSuggestion", () => { + it("should filter code patterns (comments, preprocessor)", () => { + // Comments - filtered by the regex check in cleanSuggestion + expect(autocomplete.cleanSuggestion("// comment", "")).toBe("") + expect(autocomplete.cleanSuggestion("/* comment", "")).toBe("") + expect(autocomplete.cleanSuggestion("* something", "")).toBe("") // Code patterns - expect(filter("#include")).toBe(true) - expect(filter("# Header")).toBe(true) + expect(autocomplete.cleanSuggestion("#include", "")).toBe("") + 
expect(autocomplete.cleanSuggestion("# Header", "")).toBe("") + }) - // Meaningless content - expect(filter("")).toBe(true) - expect(filter("a")).toBe(true) - expect(filter("...")).toBe(true) + it("should filter empty content", () => { + // Empty content is filtered by postprocessGhostSuggestion + expect(autocomplete.cleanSuggestion("", "")).toBe("") }) it("should accept natural language suggestions", () => { - const filter = autocomplete.isUnwantedSuggestion.bind(autocomplete) - - expect(filter("Hello world")).toBe(false) - expect(filter("Can you help me")).toBe(false) - expect(filter("test123")).toBe(false) - expect(filter("What's up?")).toBe(false) + expect(autocomplete.cleanSuggestion("Hello world", "")).toBe("Hello world") + expect(autocomplete.cleanSuggestion("Can you help me", "")).toBe("Can you help me") + expect(autocomplete.cleanSuggestion("test123", "")).toBe("test123") + expect(autocomplete.cleanSuggestion("What's up?", "")).toBe("What's up?") }) it("should accept symbols in middle of text", () => { - const filter = autocomplete.isUnwantedSuggestion.bind(autocomplete) + expect(autocomplete.cleanSuggestion("Text with # in middle", "")).toBe("Text with # in middle") + expect(autocomplete.cleanSuggestion("Hello // but not a comment", "")).toBe("Hello // but not a comment") + }) + + it("should truncate at first newline", () => { + expect(autocomplete.cleanSuggestion("First line\nSecond line", "")).toBe("First line") + }) - expect(filter("Text with # in middle")).toBe(false) - expect(filter("Hello // but not a comment")).toBe(false) + it("should remove prefix overlap", () => { + expect(autocomplete.cleanSuggestion("Hello world", "Hello ")).toBe("world") }) }) }) diff --git a/src/services/ghost/classic-auto-complete/__tests__/uselessSuggestionFilter.test.ts b/src/services/ghost/classic-auto-complete/__tests__/uselessSuggestionFilter.test.ts index d6b888b3cf3..b0ee68f4092 100644 --- a/src/services/ghost/classic-auto-complete/__tests__/uselessSuggestionFilter.test.ts +++ b/src/services/ghost/classic-auto-complete/__tests__/uselessSuggestionFilter.test.ts @@ -250,4 +250,22 @@ return 1 `), ).toBe(true) }) + + it("treats as duplication when suggestion repeats the same phrase from the prefix", () => { + // User types "We are going to start from" and suggestion repeats "the beginning. We are going to start from the beginning..." + expect( + isDuplication( + `We are going to start from <<>>`, + ), + ).toBe(true) + }) + + it("treats as duplication when suggestion ends with non-word characters but still has repetitive phrases", () => { + // Suggestion ends with "..." 
but the repeating phrase should still be detected + expect( + isDuplication( + `<<>>`, + ), + ).toBe(true) + }) }) diff --git a/src/services/ghost/classic-auto-complete/uselessSuggestionFilter.ts b/src/services/ghost/classic-auto-complete/uselessSuggestionFilter.ts index a32334e12ba..114b8d7a2b7 100644 --- a/src/services/ghost/classic-auto-complete/uselessSuggestionFilter.ts +++ b/src/services/ghost/classic-auto-complete/uselessSuggestionFilter.ts @@ -18,6 +18,11 @@ export function suggestionConsideredDuplication(params: AutocompleteSuggestion): return true } + // Check if the suggestion contains repetitive phrases that continue from the prefix + if (containsRepetitivePhraseFromPrefix(params)) { + return true + } + // When the suggestion isn't a full line or set of lines, normalize by including // the rest of the line in the prefix/suffix and check with the completed line(s) const normalized = normalizeToCompleteLine(params) @@ -58,6 +63,43 @@ function DuplicatesFromEdgeLines(params: AutocompleteSuggestion): boolean { return false } +/** + * Detects when a suggestion's tail is repeating itself - a common LLM failure mode. + * For example: "the beginning. We are going to start from the beginning. We are going to start from the beginning..." + * The suggestion gets stuck in a loop repeating the same phrase. + */ +function containsRepetitivePhraseFromPrefix(params: AutocompleteSuggestion): boolean { + const suggestion = params.suggestion + const phraseLength = 30 // Phrase length to check for repetition + const minRepetitions = 3 // Minimum number of repetitions to consider it repetitive + + // Only check suggestions that are long enough to contain repetition + if (suggestion.length < phraseLength * minRepetitions) { + return false + } + + // Strip non-word characters from the right before selecting the tail + // This handles cases like "...the beginning..." where trailing punctuation would break detection + const strippedSuggestion = suggestion.replace(/\W+$/, "") + + if (strippedSuggestion.length < phraseLength) { + return false + } + + // Extract a phrase from the end of the stripped suggestion + const phrase = strippedSuggestion.slice(-phraseLength) + + // Count how many times this phrase appears in the original suggestion + let count = 0 + let pos = 0 + while ((pos = suggestion.indexOf(phrase, pos)) !== -1) { + count++ + pos += phrase.length + } + + return count >= minRepetitions +} + /** * Normalizes partial-line suggestions by expanding them to the full current line: * (prefix line tail) + (suggestion first line) + (suffix line head).