diff --git a/.cursor/rules/swift-build.mdc b/.cursor/rules/swift-build.mdc
index a7e196b52..85ece18dc 100644
--- a/.cursor/rules/swift-build.mdc
+++ b/.cursor/rules/swift-build.mdc
@@ -6,12 +6,26 @@ alwaysApply: false
 
 # Building OsaurusCore
 
-The xcode workspace has pre-existing build failures in external dependencies (`mlx-swift-lm`, `IkigaJSON`). Never use `xcodebuild` to verify changes — it will always fail on those deps and waste tokens.
+Use focused package tests while iterating, and use CI-parity `xcodebuild` only when you need to reproduce the GitHub Actions `test-core` job.
 
-Instead, compile only the OsaurusCore package sources (no linking) to verify your changes:
+Fast local checks from the repository root:
 
 ```bash
-cd /Users/tpae/dev/osaurus/Packages/OsaurusCore && swift build 2>&1 | grep -E "error:" | grep -v "IkigaJSON"
+swift test --package-path Packages/OsaurusCore
+swift test --package-path Packages/OsaurusCLI --parallel
+swift-format lint --strict --recursive Packages App
 ```
 
-If the filtered output is empty, your code compiles cleanly.
+CI-parity check from the repository root:
+
+```bash
+make ci-test
+```
+
+If you only need a compile-only smoke test of the core sources (no tests are run), use:
+
+```bash
+swift build --package-path Packages/OsaurusCore
+```
+
+Do not hardcode local absolute paths in docs or scripts. Use repo-root-relative commands unless a tool explicitly requires an absolute path.
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 159f9094e..85c4a5fb7 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -20,7 +20,7 @@ If UI updated, add before/after.
 
 ## Checklist
 
-- [ ] I have read `CONTRIBUTING.md`
+- [ ] I have read `docs/CONTRIBUTING.md`
 - [ ] I added/updated tests where reasonable
 - [ ] I updated docs/README as needed
-- [ ] I verified build on macOS with Xcode 16.4+
+- [ ] I verified build on macOS with a Swift 6.2-capable Xcode toolchain
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b4b0b98d6..a13fbf584 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -26,7 +26,7 @@ permissions:
 env:
   # Bump to invalidate every cache entry without source surgery (e.g., after a
   # known-bad cache or an Xcode toolchain upgrade we want to flush manually).
-  CACHE_SALT: v2-vmlx-5b84387
+  CACHE_SALT: v3-pr-cold-deriveddata
   # Pin Xcode so cache keys are stable across runner image bumps. When you
   # need to upgrade, change here AND in setup-xcode below.
   XCODE_VERSION: "26.4.1"
@@ -83,9 +83,11 @@ jobs:
 
       - name: Restore DerivedData cache
         id: dd-cache
-        # Always restore so `cache-primary-key` is populated for the save
-        # step at the bottom (the wipe step below handles forced cold
-        # builds without preventing main from repopulating the cache).
+        # Restore only on main pushes / manual maintainer runs. Pull requests
+        # intentionally cold-build DerivedData: exact restore-key hits have
+        # still produced stale Swift modules whose C-module dependencies are
+        # missing when Xcode later compiles EventSource.
+        if: ${{ github.event_name != 'pull_request' }}
         uses: actions/cache/restore@v5
         with:
           path: ~/Library/Developer/Xcode/DerivedData
@@ -97,13 +99,15 @@ jobs:
           restore-keys: |
             dd-${{ runner.os }}-${{ env.CACHE_SALT }}-xcode${{ env.XCODE_VERSION }}-
 
-      # Make "clear the build cache" a one-click operation. Two triggers:
-      #   1. `github.run_attempt != '1'` — i.e. a re-run. The default
+      # Make "clear the build cache" a one-click operation. Three triggers:
+      #   1. Pull requests — always cold-build DerivedData so PRs never trust
+      #      a cached Xcode build product from another ref.
+      #   2. `github.run_attempt != '1'` — i.e. a re-run. The default
       #      "Re-run failed jobs" button is the natural place for someone
       #      who just saw a build failure to land, so we make that the
       #      intuitive escape hatch for cache poison: the first attempt
       #      uses the cache (fast); any re-run forces a cold compile.
-      #   2. `workflow_dispatch.clear_cache=true` — manual force-cold on
+      #   3. `workflow_dispatch.clear_cache=true` — manual force-cold on
       #      a fresh run (e.g. validating a CACHE_SALT bump before PRs
       #      start hitting it).
       #
@@ -116,18 +120,18 @@ jobs:
       # every re-run cost ~2 min in PR #951 run 24937664669 — wasted
       # budget that contributed to the 30-min cold-build cancellation.
       #
-      # We wipe AFTER the restore step (rather than skipping the restore)
-      # so `steps.dd-cache.outputs.cache-primary-key` stays populated and
-      # the `Save DerivedData cache` step at the bottom can still
-      # repopulate the cache on a successful `main` run.
-      - name: Wipe restored DerivedData (re-run or workflow_dispatch clear_cache)
-        if: ${{ github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
+      # On main/manual runs we wipe AFTER the restore step (rather than
+      # skipping the restore) so `steps.dd-cache.outputs.cache-primary-key`
+      # stays populated and the `Save DerivedData cache` step at the bottom
+      # can still repopulate the cache on a successful `main` run.
+      - name: Wipe restored DerivedData (PR, re-run, or workflow_dispatch clear_cache)
+        if: ${{ github.event_name == 'pull_request' || github.run_attempt != '1' || (github.event_name == 'workflow_dispatch' && inputs.clear_cache) }}
         run: |
-          REASON="run_attempt=${{ github.run_attempt }}"
+          REASON="event=${{ github.event_name }}, run_attempt=${{ github.run_attempt }}"
           if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ "${{ inputs.clear_cache }}" = "true" ]; then
             REASON="$REASON, workflow_dispatch clear_cache=true"
           fi
-          echo "::notice title=Cold build forced::Wiping restored DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved). To re-run with the warm cache instead, push a new commit or trigger a fresh run."
+          echo "::notice title=Cold build forced::Wiping DerivedData before build ($REASON). SPM cache preserved (it's source-only and pinned by Package.resolved)."
           rm -rf "$HOME/Library/Developer/Xcode/DerivedData"
 
       - name: Resolve dependencies
@@ -248,7 +252,7 @@ jobs:
                 echo
                 echo "**\`run_attempt > 1\` AND \`cache-hit: false\`?** That's the deliberate cold-rebuild path triggered by **Re-run failed jobs** — see the \`Wipe restored DerivedData\` step in this job. If the cold build is exhausting the 45-min budget on every re-run, the codebase has outgrown the budget; bump \`timeout-minutes\` and update its comment block, OR move warm-cache priming to a nightly \`main\` job so PRs always warm-start."
                 echo
-                echo "**Suspect cache poisoning on a fresh attempt?** Click **Re-run failed jobs** — re-runs automatically wipe DerivedData (the SPM cache is preserved because it's pinned by \`Package.resolved\` and can't be poisoned)."
+                echo "**Suspect cache poisoning on a fresh attempt?** Pull requests already cold-build DerivedData; main/manual re-runs wipe DerivedData automatically while preserving the pinned SPM source cache."
               } >> "$GITHUB_STEP_SUMMARY"
             else
               # Mode B.
diff --git a/Packages/OsaurusCore/Models/API/OpenAIAPI.swift b/Packages/OsaurusCore/Models/API/OpenAIAPI.swift
index cde1cfc2c..cc4ae97ce 100644
--- a/Packages/OsaurusCore/Models/API/OpenAIAPI.swift
+++ b/Packages/OsaurusCore/Models/API/OpenAIAPI.swift
@@ -249,6 +249,9 @@ struct ChatMessage: Codable, Sendable {
     let tool_calls: [ToolCall]?
     /// Required for role=="tool" messages to associate with a prior tool call
     let tool_call_id: String?
+    /// Provider-specific reasoning text that some thinking APIs require on
+    /// follow-up requests after an assistant tool-call turn.
+    let reasoning_content: String?
 
     /// Extract image URLs from content parts (supports both data URLs and http URLs)
     var imageUrls: [String] {
@@ -306,6 +309,7 @@ extension ChatMessage {
         case content
         case tool_calls
         case tool_call_id
+        case reasoning_content
     }
 
     public init(from decoder: Decoder) throws {
@@ -313,6 +317,7 @@ extension ChatMessage {
         self.role = try container.decode(String.self, forKey: .role)
         self.tool_calls = try? container.decode([ToolCall].self, forKey: .tool_calls)
         self.tool_call_id = try? container.decode(String.self, forKey: .tool_call_id)
+        self.reasoning_content = try? container.decode(String.self, forKey: .reasoning_content)
 
         if let stringContent = try? container.decode(String.self, forKey: .content) {
             self.content = stringContent
@@ -357,6 +362,7 @@ extension ChatMessage {
         // Note: content is intentionally omitted when nil (e.g., assistant messages with tool_calls)
         try container.encodeIfPresent(tool_calls, forKey: .tool_calls)
         try container.encodeIfPresent(tool_call_id, forKey: .tool_call_id)
+        try container.encodeIfPresent(reasoning_content, forKey: .reasoning_content)
     }
 }
 
@@ -367,15 +373,23 @@ extension ChatMessage {
         self.contentParts = nil
         self.tool_calls = nil
         self.tool_call_id = nil
+        self.reasoning_content = nil
     }
 
     /// Initialize with optional tool calls and tool call id
-    init(role: String, content: String?, tool_calls: [ToolCall]?, tool_call_id: String?) {
+    init(
+        role: String,
+        content: String?,
+        tool_calls: [ToolCall]?,
+        tool_call_id: String?,
+        reasoning_content: String? = nil
+    ) {
         self.role = role
         self.content = content
         self.contentParts = nil
         self.tool_calls = tool_calls
         self.tool_call_id = tool_call_id
+        self.reasoning_content = reasoning_content
     }
 
     /// Initialize with multimodal content (text and images)
@@ -399,6 +413,7 @@ extension ChatMessage {
         self.content = text.isEmpty ? nil : text
         self.tool_calls = nil
         self.tool_call_id = nil
+        self.reasoning_content = nil
     }
 
     /// Multimodal init covering image + audio + video. Used by the
@@ -450,6 +465,7 @@ extension ChatMessage {
         self.content = text.isEmpty ? nil : text
         self.tool_calls = nil
         self.tool_call_id = nil
+        self.reasoning_content = nil
     }
 }
 
@@ -824,13 +840,12 @@ public enum JSONValue: Codable, Sendable, Equatable {
 
 extension JSONValue {
     /// Convert JSONValue to Sendable-compatible value for Jinja chat templates.
-    /// Null values are dropped from dictionaries because Jinja's `Value(any:)` cannot
-    /// handle `NSNull` and throws a runtime error. JSON Schema treats a missing key
-    /// the same as `null`, so this is semantically lossless for tool specs.
-    var sendableValue: any Sendable {
+    /// `.null` maps to `nil` and is dropped from arrays and objects because Jinja's
+    /// `Value(any:)` cannot handle null/optional placeholders inside erased Swift containers.
+    var sendableValue: (any Sendable)? {
         switch self {
         case .null:
-            return NSNull()
+            return nil
         case .bool(let b):
             return b
         case .number(let n):
@@ -838,12 +853,13 @@ extension JSONValue {
         case .string(let s):
             return s
         case .array(let arr):
-            return arr.map { $0.sendableValue }
+            return arr.compactMap { $0.sendableValue }
         case .object(let obj):
             var dict: [String: any Sendable] = [:]
             for (k, v) in obj {
-                if case .null = v { continue }
-                dict[k] = v.sendableValue
+                if let converted = v.sendableValue {
+                    dict[k] = converted
+                }
             }
             return dict
         }
@@ -881,8 +897,8 @@ extension ToolFunction {
         if let description {
             fn["description"] = description
         }
-        if let parameters {
-            fn["parameters"] = parameters.sendableValue
+        if let parameters, let converted = parameters.sendableValue {
+            fn["parameters"] = converted
         }
         return fn
     }
diff --git a/Packages/OsaurusCore/Models/Chat/ChatSessionStore.swift b/Packages/OsaurusCore/Models/Chat/ChatSessionStore.swift
index 1c1a054ba..e74d7691a 100644
--- a/Packages/OsaurusCore/Models/Chat/ChatSessionStore.swift
+++ b/Packages/OsaurusCore/Models/Chat/ChatSessionStore.swift
@@ -72,6 +72,11 @@ enum ChatSessionStore {
             print("[ChatSessionStore] Failed to open chat-history database: \(error)")
             return
         }
+        #if DEBUG
+            if RuntimeEnvironment.isUnderTests, OsaurusPaths.overrideRoot == nil {
+                return
+            }
+        #endif
         LegacySessionImporter.runIfNeeded()
     }
 }
diff --git a/Packages/OsaurusCore/Services/Chat/ChatEngine.swift b/Packages/OsaurusCore/Services/Chat/ChatEngine.swift
index 796ac0371..6e6287ed1 100644
--- a/Packages/OsaurusCore/Services/Chat/ChatEngine.swift
+++ b/Packages/OsaurusCore/Services/Chat/ChatEngine.swift
@@ -177,7 +177,8 @@ actor ChatEngine: Sendable, ChatEngineProtocol {
             role: "assistant",
             content: nil,
             tool_calls: toolCalls,
-            tool_call_id: nil
+            tool_call_id: nil,
+            reasoning_content: invocations.compactMap(\.reasoningContent).first
         )
         let choice = ChatChoice(index: 0, message: assistant, finish_reason: "tool_calls")
         let usage = Usage(prompt_tokens: inputTokens, completion_tokens: 0, total_tokens: inputTokens)
diff --git a/Packages/OsaurusCore/Services/Inference/ModelService.swift b/Packages/OsaurusCore/Services/Inference/ModelService.swift
index a6591b905..e097b828c 100644
--- a/Packages/OsaurusCore/Services/Inference/ModelService.swift
+++ b/Packages/OsaurusCore/Services/Inference/ModelService.swift
@@ -77,12 +77,22 @@ struct ServiceToolInvocation: Error, Sendable {
     let toolCallId: String?
     /// Optional thought signature for Gemini thinking-mode models (e.g. Gemini 2.5)
     let geminiThoughtSignature: String?
+    /// Provider reasoning text that must be echoed on assistant tool-call
+    /// messages for APIs such as DeepSeek thinking mode.
+    let reasoningContent: String?
 
-    init(toolName: String, jsonArguments: String, toolCallId: String? = nil, geminiThoughtSignature: String? = nil) {
+    init(
+        toolName: String,
+        jsonArguments: String,
+        toolCallId: String? = nil,
+        geminiThoughtSignature: String? = nil,
+        reasoningContent: String? = nil
+    ) {
         self.toolName = toolName
         self.jsonArguments = jsonArguments
         self.toolCallId = toolCallId
         self.geminiThoughtSignature = geminiThoughtSignature
+        self.reasoningContent = reasoningContent
     }
 }
 
diff --git a/Packages/OsaurusCore/Services/Provider/RemoteProviderService.swift b/Packages/OsaurusCore/Services/Provider/RemoteProviderService.swift
index ec910884e..641806f3e 100644
--- a/Packages/OsaurusCore/Services/Provider/RemoteProviderService.swift
+++ b/Packages/OsaurusCore/Services/Provider/RemoteProviderService.swift
@@ -218,7 +218,7 @@ public actor RemoteProviderService: ToolCapableService {
             configuredProviderType: provider.providerType,
             request: request
         )
-        let (content, _) = try parseResponse(data, providerType: responseProviderType)
+        let (content, _, _) = try parseResponse(data, providerType: responseProviderType)
         return content ?? ""
     }
 
@@ -307,7 +307,7 @@ public actor RemoteProviderService: ToolCapableService {
             configuredProviderType: provider.providerType,
             request: request
         )
-        let (content, toolCalls) = try parseResponse(data, providerType: responseProviderType)
+        let (content, toolCalls, reasoningContent) = try parseResponse(data, providerType: responseProviderType)
 
         // Check for tool calls
         if let toolCalls = toolCalls, let firstCall = toolCalls.first {
@@ -315,7 +315,8 @@ public actor RemoteProviderService: ToolCapableService {
                 toolName: firstCall.function.name,
                 jsonArguments: firstCall.function.arguments,
                 toolCallId: firstCall.id,
-                geminiThoughtSignature: firstCall.geminiThoughtSignature
+                geminiThoughtSignature: firstCall.geminiThoughtSignature,
+                reasoningContent: reasoningContent
             )
         }
 
@@ -653,6 +654,9 @@ public actor RemoteProviderService: ToolCapableService {
         /// Yielded text content. Only used when `trackContent` is `true`
         /// (streamWithTools, for the inline tool-call detection fallback).
         var accumulatedContent: String = ""
+        /// Reasoning text accumulated from streamed `reasoning_content` deltas.
+        /// DeepSeek requires callers to echo this on the assistant tool-call message.
+        var accumulatedReasoningContent: String = ""
 
         let stopSequences: [String]
         let trackContent: Bool
@@ -692,9 +696,13 @@ public actor RemoteProviderService: ToolCapableService {
     /// a successful call to lock into history.
     static func resolveAccumulatedToolCall(
         from accumulated: [Int: StreamingState.ToolSlot],
+        reasoningContent: String,
         finishMarker: String
     ) -> AccumulatedToolCallResult {
-        guard let (invocation, wasRepaired) = makeToolInvocation(from: accumulated) else {
+        guard let (invocation, wasRepaired) = makeToolInvocation(
+            from: accumulated,
+            reasoningContent: reasoningContent
+        ) else {
             return .none
         }
         if wasRepaired {
@@ -865,6 +873,7 @@ public actor RemoteProviderService: ToolCapableService {
             if finishReason == "STOP" || finishReason == "MAX_TOKENS" {
                 switch resolveAccumulatedToolCall(
                     from: state.accumulatedToolCalls,
+                    reasoningContent: state.accumulatedReasoningContent,
                     finishMarker: "gemini=\(finishReason)"
                 ) {
                 case .none: return .finishNormal
@@ -931,6 +940,7 @@ public actor RemoteProviderService: ToolCapableService {
         case "message_stop":
             switch resolveAccumulatedToolCall(
                 from: state.accumulatedToolCalls,
+                reasoningContent: state.accumulatedReasoningContent,
                 finishMarker: "anthropic message_stop"
             ) {
             case .none: return .finishNormal
@@ -1026,6 +1036,7 @@ public actor RemoteProviderService: ToolCapableService {
             state.lastFinishReason = "completed"
             switch resolveAccumulatedToolCall(
                 from: state.accumulatedToolCalls,
+                reasoningContent: state.accumulatedReasoningContent,
                 finishMarker: "response.completed"
             ) {
             case .none: return .finishNormal
@@ -1081,10 +1092,10 @@ public actor RemoteProviderService: ToolCapableService {
         // (DeepSeek, Qwen, Together, vLLM). Forwarded as a sentinel so the
         // SSE layer routes it onto `reasoning_content` and ChatView places
         // it in the Think panel — without ever emitting `<think>` literals.
-        if state.accumulatedToolCalls.isEmpty,
-            let reasoning = chunk.choices.first?.delta.reasoning_content,
-            !reasoning.isEmpty
-        {
+        // Always accumulate for echo-back; only the Think-panel hint is gated on tool calls.
+        let reasoningDelta = chunk.choices.first?.delta.reasoning_content
+        state.accumulatedReasoningContent += reasoningDelta ?? ""
+        if state.accumulatedToolCalls.isEmpty, let reasoning = reasoningDelta, !reasoning.isEmpty {
             yield(StreamingReasoningHint.encode(reasoning))
         }
 
@@ -1104,6 +1115,7 @@ public actor RemoteProviderService: ToolCapableService {
             state.lastFinishReason = finishReason
             switch resolveAccumulatedToolCall(
                 from: state.accumulatedToolCalls,
+                reasoningContent: state.accumulatedReasoningContent,
                 finishMarker: "finish_reason=\(finishReason)"
             ) {
             case .none: break
@@ -1127,6 +1139,7 @@ public actor RemoteProviderService: ToolCapableService {
     ) {
         switch resolveAccumulatedToolCall(
             from: state.accumulatedToolCalls,
+            reasoningContent: state.accumulatedReasoningContent,
             finishMarker: finishMarker
         ) {
         case .ready(let invocation):
@@ -1343,7 +1356,8 @@ public actor RemoteProviderService: ToolCapableService {
     /// malformed and had to be structurally closed — strong signal that the stream
     /// was truncated mid-argument, especially when no `finish_reason` was ever seen.
     private static func makeToolInvocation(
-        from accumulated: [Int: (id: String?, name: String?, args: String, thoughtSignature: String?)]
+        from accumulated: [Int: (id: String?, name: String?, args: String, thoughtSignature: String?)],
+        reasoningContent: String = ""
     ) -> (invocation: ServiceToolInvocation, wasRepaired: Bool)? {
         guard let first = accumulated.min(by: { $0.key < $1.key }),
             let name = first.value.name
@@ -1355,7 +1369,8 @@ public actor RemoteProviderService: ToolCapableService {
                 toolName: name,
                 jsonArguments: validated.json,
                 toolCallId: first.value.id,
-                geminiThoughtSignature: first.value.thoughtSignature
+                geminiThoughtSignature: first.value.thoughtSignature,
+                reasoningContent: reasoningContent.isEmpty ? nil : reasoningContent
             ),
             validated.wasRepaired
         )
@@ -1863,7 +1878,7 @@ public actor RemoteProviderService: ToolCapableService {
     private func parseResponse(
         _ data: Data,
         providerType: RemoteProviderType
-    ) throws -> (content: String?, toolCalls: [ToolCall]?) {
+    ) throws -> (content: String?, toolCalls: [ToolCall]?, reasoningContent: String?) {
         switch providerType {
         case .anthropic:
             let response = try JSONDecoder().decode(AnthropicMessagesResponse.self, from: data)
@@ -1887,13 +1902,14 @@ public actor RemoteProviderService: ToolCapableService {
                 }
             }
 
-            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls)
+            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls, nil)
 
         case .openaiLegacy, .azureOpenAI:
             let response = try JSONDecoder().decode(ChatCompletionResponse.self, from: data)
-            let content = response.choices.first?.message.content
-            let toolCalls = response.choices.first?.message.tool_calls
-            return (content, toolCalls)
+            let message = response.choices.first?.message
+            let content = message?.content
+            let toolCalls = message?.tool_calls
+            return (content, toolCalls, message?.reasoning_content)
 
         case .openResponses, .openAICodex:
             let response = try JSONDecoder().decode(OpenResponsesResponse.self, from: data)
@@ -1925,7 +1941,7 @@ public actor RemoteProviderService: ToolCapableService {
                 }
             }
 
-            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls)
+            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls, nil)
 
         case .gemini:
             let response = try JSONDecoder().decode(GeminiGenerateContentResponse.self, from: data)
@@ -1959,13 +1975,13 @@ public actor RemoteProviderService: ToolCapableService {
                 }
             }
 
-            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls)
+            return (textContent.isEmpty ? nil : textContent, toolCalls.isEmpty ? nil : toolCalls, nil)
 
         case .osaurus:
             // Native Osaurus agent returns OpenAI-compatible responses
             let response = try JSONDecoder().decode(ChatCompletionResponse.self, from: data)
             let content = response.choices.first?.message.content
-            return (content, nil)
+            return (content, nil, nil)
         }
     }
 
diff --git a/Packages/OsaurusCore/Storage/ChatHistoryDatabase.swift b/Packages/OsaurusCore/Storage/ChatHistoryDatabase.swift
index 75fe037ca..1fd06c449 100644
--- a/Packages/OsaurusCore/Storage/ChatHistoryDatabase.swift
+++ b/Packages/OsaurusCore/Storage/ChatHistoryDatabase.swift
@@ -53,6 +53,13 @@ public final class ChatHistoryDatabase: @unchecked Sendable {
     // MARK: - Lifecycle
 
     public func open() throws {
+        #if DEBUG
+            if RuntimeEnvironment.isUnderTests, OsaurusPaths.overrideRoot == nil {
+                try openInMemory()
+                return
+            }
+        #endif
+
         // Defensive gate: production flow already awaits the
         // migrator in `AppDelegate.applicationDidFinishLaunching`,
         // but tests + future headless entry points may call
diff --git a/Packages/OsaurusCore/Tests/Chat/ChatViewSandboxTests.swift b/Packages/OsaurusCore/Tests/Chat/ChatViewSandboxTests.swift
index 9c85fc9b7..66be2e012 100644
--- a/Packages/OsaurusCore/Tests/Chat/ChatViewSandboxTests.swift
+++ b/Packages/OsaurusCore/Tests/Chat/ChatViewSandboxTests.swift
@@ -49,6 +49,8 @@ struct ChatViewSandboxTests {
         await SandboxTestLock.runWithStoragePaths {
             let manager = AgentManager.shared
             let originalActiveAgentId = manager.activeAgentId
+            ToolRegistry.shared.unregisterAllSandboxTools()
+
             let inactiveAgent = Agent(
                 name: "Chat Estimate Off",
                 agentAddress: "test-chat-estimate-off"
@@ -75,13 +77,10 @@ struct ChatViewSandboxTests {
             let inactiveBreakdown = inactiveSession.estimatedContextBreakdown
             let sandboxBreakdown = sandboxSession.estimatedContextBreakdown
 
-            let inactiveContextTokens = inactiveBreakdown.context.reduce(0) { $0 + $1.tokens }
-            let sandboxContextTokens = sandboxBreakdown.context.reduce(0) { $0 + $1.tokens }
-            #expect(sandboxContextTokens > inactiveContextTokens)
+            #expect(inactiveBreakdown.context.contains { $0.id == "sandbox" } == false)
+            #expect(sandboxBreakdown.context.contains { $0.id == "sandbox" })
 
             let sandboxToolTokens = sandboxBreakdown.context.first { $0.id == "tools" }?.tokens ?? 0
-            let inactiveToolTokens = inactiveBreakdown.context.first { $0.id == "tools" }?.tokens ?? 0
-            #expect(sandboxToolTokens > inactiveToolTokens)
             #expect(sandboxToolTokens >= ToolRegistry.shared.estimatedTokens(for: "sandbox_exec"))
 
             ToolRegistry.shared.unregisterAllSandboxTools()
diff --git a/Packages/OsaurusCore/Tests/Provider/RemoteChatRequestEncodingTests.swift b/Packages/OsaurusCore/Tests/Provider/RemoteChatRequestEncodingTests.swift
index 920ca63cc..2525f8b40 100644
--- a/Packages/OsaurusCore/Tests/Provider/RemoteChatRequestEncodingTests.swift
+++ b/Packages/OsaurusCore/Tests/Provider/RemoteChatRequestEncodingTests.swift
@@ -60,6 +60,42 @@ struct RemoteChatRequestEncodingTests {
         #expect(payload["max_completion_tokens"] == nil)
     }
 
+    @Test func encode_assistantToolCall_preservesReasoningContent() throws {
+        let call = ToolCall(
+            id: "call_123",
+            type: "function",
+            function: ToolCallFunction(name: "search", arguments: "{\"q\":\"DeepSeek\"}")
+        )
+        let assistant = ChatMessage(
+            role: "assistant",
+            content: nil,
+            tool_calls: [call],
+            tool_call_id: nil,
+            reasoning_content: "I should search first."
+        )
+        let request = RemoteChatRequest(
+            model: "deepseek-v4-pro",
+            messages: [assistant],
+            temperature: 0.7,
+            max_completion_tokens: 512,
+            stream: false,
+            top_p: nil,
+            frequency_penalty: nil,
+            presence_penalty: nil,
+            stop: nil,
+            tools: nil,
+            tool_choice: nil,
+            reasoning_effort: nil,
+            reasoning: nil,
+            modelOptions: [:],
+            veniceParameters: nil
+        )
+
+        let payload = try Self.encodeAsDictionary(request)
+        let messages = try #require(payload["messages"] as? [[String: Any]])
+        #expect(messages.first?["reasoning_content"] as? String == "I should search first.")
+    }
+
     @Test func openResponsesRequest_defaultSingleUserMessage_usesTextShorthand() throws {
         let request = Self.makeRequest(model: "gpt-5.2", maxTokens: 1024)
         let responsesRequest = request.toOpenResponsesRequest()
diff --git a/Packages/OsaurusCore/Tests/Tool/ToolSerializationStabilityTests.swift b/Packages/OsaurusCore/Tests/Tool/ToolSerializationStabilityTests.swift
index 550f1a4f7..8cbacf4e4 100644
--- a/Packages/OsaurusCore/Tests/Tool/ToolSerializationStabilityTests.swift
+++ b/Packages/OsaurusCore/Tests/Tool/ToolSerializationStabilityTests.swift
@@ -43,4 +43,32 @@ struct ToolSerializationStabilityTests {
         let bData = try JSONSerialization.data(withJSONObject: b, options: [.sortedKeys])
         #expect(aData == bData)
     }
+
+    @Test
+    func toTokenizerToolSpec_dropsNullsBeforeJinjaConversion() throws {
+        let tool = Tool(
+            type: "function",
+            function: ToolFunction(
+                name: "schema_probe",
+                description: nil,
+                parameters: .object([
+                    "type": .string("object"),
+                    "properties": .object([
+                        "value": .object([
+                            "type": .array([.string("string"), .null]),
+                            "description": .null,
+                        ])
+                    ]),
+                ])
+            )
+        )
+
+        let spec = tool.toTokenizerToolSpec()
+        let data = try JSONSerialization.data(withJSONObject: spec, options: [.sortedKeys])
+        let json = String(decoding: data, as: UTF8.self)
+
+        #expect(!json.contains("null"))
+        #expect(json.contains("\"type\":[\"string\"]"))
+        #expect(!json.contains("description"))
+    }
 }
diff --git a/Packages/OsaurusCore/Utils/OsaurusPaths.swift b/Packages/OsaurusCore/Utils/OsaurusPaths.swift
index e0edf8cac..07cda8b19 100644
--- a/Packages/OsaurusCore/Utils/OsaurusPaths.swift
+++ b/Packages/OsaurusCore/Utils/OsaurusPaths.swift
@@ -434,6 +434,40 @@ public enum OsaurusPaths {
         return total
     }
 
+    /// Finder-style free capacity, in bytes, for the volume containing `url`.
+    /// Prefers the important-usage URL resource key (modern APFS reports
+    /// purgeable capacity there; legacy filesystem attributes can return zero
+    /// for some sandboxed container paths). A zero from that key falls through
+    /// to `.systemFreeSize`, since it may only mean the key is unsupported on
+    /// this volume — NOTE(review): confirm against external / non-APFS disks.
+    /// Returns `nil` only when neither source produces a value.
+    public static func volumeFreeBytes(containing url: URL) -> Int64? {
+        let keys: Set<URLResourceKey> = [.volumeAvailableCapacityForImportantUsageKey]
+        let important = (try? url.resourceValues(forKeys: keys))?.volumeAvailableCapacityForImportantUsage
+        if let important, important > 0 {
+            return important
+        }
+        let attrs = try? FileManager.default.attributesOfFileSystem(forPath: url.path)
+        let legacy = (attrs?[.systemFreeSize] as? NSNumber)?.int64Value
+        if let legacy, legacy > 0 { return legacy }
+        // Nothing positive: keep whichever reading exists (possibly a genuine zero).
+        return important ?? legacy
+    }
+
+    /// Total bytes on the volume containing `url`: URL resource value first, `.systemSize` attribute as fallback.
+    public static func volumeTotalBytes(containing url: URL) -> Int64? {
+        let resourceTotal = (try? url.resourceValues(forKeys: [.volumeTotalCapacityKey]))?.volumeTotalCapacity
+        if let resourceTotal {
+            return Int64(resourceTotal)
+        }
+        // Resource key unavailable: consult the legacy file-system attributes instead.
+        let attrs = try? FileManager.default.attributesOfFileSystem(forPath: url.path)
+        if let legacyTotal = (attrs?[.systemSize] as? NSNumber)?.int64Value {
+            return legacyTotal
+        }
+        return nil
+    }
+
     // MARK: - Migration
 
     /// Recursively copy the contents of `src` into `dest` (never deletes from `src`).
diff --git a/Packages/OsaurusCore/Views/Chat/ChatView.swift b/Packages/OsaurusCore/Views/Chat/ChatView.swift
index f408d8866..3be872a0f 100644
--- a/Packages/OsaurusCore/Views/Chat/ChatView.swift
+++ b/Packages/OsaurusCore/Views/Chat/ChatView.swift
@@ -1390,7 +1390,8 @@ final class ChatSession: ObservableObject {
                             role: "assistant",
                             content: content,
                             tool_calls: t.toolCalls,
-                            tool_call_id: nil
+                            tool_call_id: nil,
+                            reasoning_content: t.thinkingIsEmpty ? nil : t.thinking
                         )
                     case .tool:
                         return ChatMessage(
@@ -1594,6 +1595,9 @@ final class ChatSession: ObservableObject {
                             function: ToolCallFunction(name: inv.toolName, arguments: inv.jsonArguments),
                             geminiThoughtSignature: inv.geminiThoughtSignature
                         )
+                        if let reasoning = inv.reasoningContent {
+                            assistantTurn.thinking = reasoning
+                        }
                         assistantTurn.pendingToolName = nil
                         assistantTurn.clearPendingToolArgs()
                         if assistantTurn.toolCalls == nil { assistantTurn.toolCalls = [] }
diff --git a/Packages/OsaurusEvals/README.md b/Packages/OsaurusEvals/README.md
index 05d8d5e73..d9dd2cebc 100644
--- a/Packages/OsaurusEvals/README.md
+++ b/Packages/OsaurusEvals/README.md
@@ -1,6 +1,6 @@
 # OsaurusEvals
 
-Catalog-driven behaviour / integration tests for Osaurus that hit a real model (Foundation, MLX, remote provider).
+Catalog-driven behaviour / integration tests for Osaurus. Some suites hit a real model (Foundation, MLX, remote provider); pure-data suites pin helper contracts without model calls.
 
 These evals are deliberately **off the CI path**. They burn LLM tokens, depend on local plugin installs, and exist to help us tune capabilities and triage new models — not to gate every commit.
 
@@ -15,7 +15,7 @@ Packages/OsaurusEvals/
     OsaurusEvalsCLI/    — `osaurus-evals` executable
   Suites/
     Preflight/          — preflight pick + companion teaser cases
-    AgentLoop/          — placeholder for future agent-loop cases
+    AgentLoop/          — model-free todo / complete / clarify contract cases
     ...
 ```
 
@@ -84,7 +84,7 @@ Minimal example:
 Field reference:
 
 - `id` — unique slug; surfaced in reports for diffing across runs.
-- `domain` — selects the runner code path. Today only `preflight` is supported.
+- `domain` — selects the runner code path. Model-backed `preflight` cases run capability selection; pure-data domains such as `agent_loop`, `schema`, `tool_envelope`, `streaming_hint`, `prefix_hash`, `argument_coercion`, and `request_validation` avoid model calls.
 - `label` — optional human label; falls back to `id`.
 - `query` — the user message preflight runs against.
 - `fixtures.preflightMode` — `off` / `narrow` / `balanced` / `wide`. Default `balanced`.
@@ -106,6 +106,20 @@ A case with empty `expect: {}` is a valid smoke test — it records what preflig
 1. Add `Suites/<NewDomain>/` with a few JSON cases.
 2. In `Sources/OsaurusEvalsKit/EvalRunner.swift`, add a `case "<newdomain>":` arm to `runOne(...)`. Keep domain runners as separate top-level functions; merging them into one branch gets messy fast.
 
+## Agent loop smoke cases
+
+`Suites/AgentLoop/` is deliberately model-free. It pins the pure helper contracts used by the chat-layer agent loop intercepts:
+
+- `todoParse` checks markdown checklist parsing via `AgentTodo.parse`.
+- `completeValidate` checks placeholder/length validation via `CompleteTool.validate`.
+- `clarifyParse` checks option normalization and payload parsing via `ClarifyTool.parse`.
+
+Run it without a live server:
+
+```bash
+make evals EVALS_SUITE=Packages/OsaurusEvals/Suites/AgentLoop
+```
+
 ## CI isolation
 
 This package is a **separate Swift package**. CI / Xcode builds run `swift build` and `swift test` from `Packages/OsaurusCore`, never from here. Even if someone does `swift test` from inside `Packages/OsaurusEvals`, no test target exists yet — runner unit tests should be added with a `OSAURUS_EVALS_ENABLED=1` env-var gate so they never burn tokens unintentionally.
@@ -115,4 +129,4 @@ This package is a **separate Swift package**. CI / Xcode builds run `swift build
 - `osaurus-evals diff baseline.json current.json` — regression check against a stored baseline.
 - Per-model scoreboards under `reports/<model>/<date>.json`.
 - Auto-run on new model release (CI workflow listening for HF releases).
-- Domain growth: `Suites/AgentLoop/`, `Suites/ToolCalling/`, `Suites/SkillInjection/`.
+- Domain growth: `Suites/ToolCalling/`, `Suites/SkillInjection/`.
diff --git a/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalCase.swift b/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalCase.swift
index fb7baa717..e590e17d4 100644
--- a/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalCase.swift
+++ b/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalCase.swift
@@ -96,6 +96,7 @@ public struct EvalCase: Sendable, Codable, Identifiable {
         public let prefixHash: PrefixHashExpectations?
         public let argumentCoercion: ArgumentCoercionExpectations?
         public let requestValidation: RequestValidationExpectations?
+        public let agentLoop: AgentLoopExpectations?
 
         public init(
             tools: ToolExpectations? = nil,
@@ -105,7 +106,8 @@ public struct EvalCase: Sendable, Codable, Identifiable {
             streamingHint: StreamingHintExpectations? = nil,
             prefixHash: PrefixHashExpectations? = nil,
             argumentCoercion: ArgumentCoercionExpectations? = nil,
-            requestValidation: RequestValidationExpectations? = nil
+            requestValidation: RequestValidationExpectations? = nil,
+            agentLoop: AgentLoopExpectations? = nil
         ) {
             self.tools = tools
             self.companions = companions
@@ -115,6 +117,7 @@ public struct EvalCase: Sendable, Codable, Identifiable {
             self.prefixHash = prefixHash
             self.argumentCoercion = argumentCoercion
             self.requestValidation = requestValidation
+            self.agentLoop = agentLoop
         }
     }
 
@@ -307,6 +310,61 @@ public struct EvalCase: Sendable, Codable, Identifiable {
         }
     }
 
+    /// Expectation for `domain == "agent_loop"` cases. Keeps the smoke suite
+    /// model-free by driving the pure helpers behind the chat agent-loop tools.
+    /// `op` picks the helper; the runner reads only that helper's input and `expect*` fields:
+    ///   - `todoParse`: markdown checklist -> `AgentTodo` counters.
+    ///   - `completeValidate`: summary -> accept/reject decision.
+    ///   - `clarifyParse`: JSON arguments -> `ClarifyPayload`.
+    public struct AgentLoopExpectations: Sendable, Codable {
+        public enum Operation: String, Sendable, Codable {  // raw value == case name, i.e. the JSON `op` string
+            case todoParse
+            case completeValidate
+            case clarifyParse
+        }
+
+        public let op: Operation  // which helper the case exercises
+        public let markdown: String?  // input for `todoParse`
+        public let argumentsJSON: String?  // input for `clarifyParse`
+        public let summary: String?  // input for `completeValidate`
+        public let expectTotal: Int?  // `todoParse`: expected total item count
+        public let expectDone: Int?  // `todoParse`: expected completed item count
+        public let expectItems: [String]?  // `todoParse`: expected item texts, compared in order
+        public let expectAccept: Bool?  // `completeValidate`: whether the summary should be accepted
+        public let expectReasonContains: String?  // `completeValidate`: substring the rejection reason must contain
+        public let expectQuestion: String?  // `clarifyParse`: expected question text
+        public let expectOptions: [String]?  // `clarifyParse`: expected options, compared in order
+        public let expectAllowMultiple: Bool?  // `clarifyParse`: expected multi-select flag
+
+        public init(  // everything except `op` is optional and defaults to nil
+            op: Operation,
+            markdown: String? = nil,
+            argumentsJSON: String? = nil,
+            summary: String? = nil,
+            expectTotal: Int? = nil,
+            expectDone: Int? = nil,
+            expectItems: [String]? = nil,
+            expectAccept: Bool? = nil,
+            expectReasonContains: String? = nil,
+            expectQuestion: String? = nil,
+            expectOptions: [String]? = nil,
+            expectAllowMultiple: Bool? = nil
+        ) {
+            self.op = op
+            self.markdown = markdown
+            self.argumentsJSON = argumentsJSON
+            self.summary = summary
+            self.expectTotal = expectTotal
+            self.expectDone = expectDone
+            self.expectItems = expectItems
+            self.expectAccept = expectAccept
+            self.expectReasonContains = expectReasonContains
+            self.expectQuestion = expectQuestion
+            self.expectOptions = expectOptions
+            self.expectAllowMultiple = expectAllowMultiple
+        }
+    }
+
     public struct ToolExpectations: Sendable, Codable {
         /// Tool names that MUST appear in the picked set. Each missing
         /// name costs a fixed weight (see `Scorers.scoreTools`).
diff --git a/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalRunner.swift b/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalRunner.swift
index 124fa542c..f9ff0f2c7 100644
--- a/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalRunner.swift
+++ b/Packages/OsaurusEvals/Sources/OsaurusEvalsKit/EvalRunner.swift
@@ -27,16 +27,14 @@ public enum EvalRunner {
         model: ModelSelection,
         filter: String? = nil
     ) async -> EvalReport {
-        // The CLI is its own process — it has to scan + dlopen every
-        // installed plugin manually before preflight can see plugin
-        // tools (the host app does this in AppDelegate). Without it
-        // every `requirePlugins` case skips with "missing plugins" no
-        // matter what's actually installed on disk.
-        await PreflightEvaluator.loadInstalledPlugins()
-
         let modelLabel = ModelOverride.describe(model)
         let startedAt = isoNow()
         var rows: [EvalCaseReport] = []
+        let runnableCases = suite.cases.filter { testCase in
+            guard let filter else { return true }
+            return testCase.id.contains(filter)
+        }
+        let needsPreflightRuntime = runnableCases.contains { $0.domain == "preflight" }
 
         // Surface decode failures up-front as `errored` rows so a
         // contributor with a typo sees the file name in the report
@@ -54,9 +52,22 @@ public enum EvalRunner {
             )
         }
 
-        await ModelOverride.withSelection(model) {
-            for testCase in suite.cases {
-                if let filter, !testCase.id.contains(filter) { continue }
+        if needsPreflightRuntime {
+            // The CLI is its own process — it has to scan + dlopen every
+            // installed plugin manually before preflight can see plugin
+            // tools (the host app does this in AppDelegate). Without it
+            // every `requirePlugins` case skips with "missing plugins" no
+            // matter what's actually installed on disk.
+            await PreflightEvaluator.loadInstalledPlugins()
+
+            await ModelOverride.withSelection(model) {
+                for testCase in runnableCases {
+                    let row = await runOne(testCase, modelId: modelLabel)
+                    rows.append(row)
+                }
+            }
+        } else {
+            for testCase in runnableCases {
                 let row = await runOne(testCase, modelId: modelLabel)
                 rows.append(row)
             }
@@ -85,6 +96,8 @@ public enum EvalRunner {
             return runArgumentCoercionCase(testCase, modelId: modelId)
         case "request_validation":
             return runRequestValidationCase(testCase, modelId: modelId)
+        case "agent_loop":
+            return runAgentLoopCase(testCase, modelId: modelId)
         case "tools", "streaming", "contract":
             // Scaffolded domains — runner implementation lives in a
             // follow-up so cases can be authored against the format
@@ -602,6 +615,159 @@ public enum EvalRunner {
         )
     }
 
+    // MARK: - Agent loop domain
+
+    /// Entry point for `domain == "agent_loop"` cases, which are meant to stay
+    /// model-free. Routes the case to the helper check selected by
+    /// `expect.agentLoop.op` (`todo`, `complete`, or `clarify` contracts).
+    private static func runAgentLoopCase(_ testCase: EvalCase, modelId: String) -> EvalCaseReport {
+        let title = testCase.label ?? testCase.id
+        if let expectation = testCase.expect.agentLoop {
+            switch expectation.op {
+            case .todoParse:
+                return runAgentLoopTodoParse(testCase, label: title, modelId: modelId, exp: expectation)
+            case .completeValidate:
+                return runAgentLoopCompleteValidate(testCase, label: title, modelId: modelId, exp: expectation)
+            case .clarifyParse:
+                return runAgentLoopClarifyParse(testCase, label: title, modelId: modelId, exp: expectation)
+            }
+        }
+        // No `agentLoop` expectation block, so there is nothing to evaluate.
+        return Self.errored(testCase, label: title, modelId: modelId, note: "missing `expect.agentLoop`")
+    }
+
+    /// Checks `AgentTodo.parse` against a `todoParse` case: compares the parsed
+    /// total / done counters and the ordered item texts with whichever
+    /// expectations the case provides. Reports errored when `markdown` is absent.
+    private static func runAgentLoopTodoParse(
+        _ testCase: EvalCase,
+        label: String,
+        modelId: String,
+        exp: EvalCase.AgentLoopExpectations
+    ) -> EvalCaseReport {
+        guard let markdown = exp.markdown else {
+            return Self.errored(testCase, label: label, modelId: modelId, note: "todoParse needs `markdown`")
+        }
+
+        let parsed = AgentTodo.parse(markdown)
+        // Every unmet expectation adds exactly one entry here, so "no entries"
+        // is equivalent to "passed".
+        var mismatches: [String] = []
+
+        if let wantTotal = exp.expectTotal, parsed.totalCount != wantTotal {
+            mismatches.append("total mismatch: expected \(wantTotal), got \(parsed.totalCount)")
+        }
+        if let wantDone = exp.expectDone, parsed.doneCount != wantDone {
+            mismatches.append("done mismatch: expected \(wantDone), got \(parsed.doneCount)")
+        }
+        if let wantItems = exp.expectItems {
+            let gotItems = parsed.items.map(\.text)
+            if gotItems != wantItems {
+                mismatches.append("items mismatch: expected \(wantItems), got \(gotItems)")
+            }
+        }
+
+        let passed = mismatches.isEmpty
+        let notes = passed ? ["parsed \(parsed.doneCount)/\(parsed.totalCount) complete"] : mismatches
+        return .terminal(
+            id: testCase.id,
+            label: label,
+            domain: testCase.domain,
+            outcome: passed ? .passed : .failed,
+            notes: notes,
+            modelId: modelId
+        )
+    }
+
+    /// Evaluates a `completeValidate` case by running `CompleteTool.validate` on
+    /// `exp.summary`; a `nil` result means the summary was accepted. `expectAccept`
+    /// defaults to accept unless `expectReasonContains` is set, which implies rejection.
+    private static func runAgentLoopCompleteValidate(
+        _ testCase: EvalCase,
+        label: String,
+        modelId: String,
+        exp: EvalCase.AgentLoopExpectations
+    ) -> EvalCaseReport {
+        guard let summary = exp.summary else {
+            return Self.errored(testCase, label: label, modelId: modelId, note: "completeValidate needs `summary`")
+        }
+
+        let reason = CompleteTool.validate(summary: summary)
+        let accepted = (reason == nil)
+        // A requested rejection reason implies rejection, so it flips the implicit default.
+        let expectedAccept = exp.expectAccept ?? (exp.expectReasonContains == nil)
+        var notes = [accepted ? "accepted" : "rejected: \(reason ?? "(unknown)")"]
+        var passed = (accepted == expectedAccept)
+        if !passed {
+            notes.append("expected summary to be \(expectedAccept ? "accepted" : "rejected")")
+        }
+        if let needle = exp.expectReasonContains {
+            if let reason, reason.contains(needle) {
+                notes.append("reason contains '\(needle)'")
+            } else {
+                passed = false
+                notes.append("expected rejection reason to contain '\(needle)'")
+            }
+        }
+
+        return .terminal(
+            id: testCase.id,
+            label: label,
+            domain: testCase.domain,
+            outcome: passed ? .passed : .failed,
+            notes: notes,
+            modelId: modelId
+        )
+    }
+
+    /// Evaluates a `clarifyParse` case: feeds `exp.argumentsJSON` to
+    /// `ClarifyTool.parse` and compares the resulting payload's question,
+    /// options, and allow-multiple flag with whichever expectations are set.
+    /// A `nil` parse result fails the case; a missing `argumentsJSON` is
+    /// reported as errored.
+    private static func runAgentLoopClarifyParse(
+        _ testCase: EvalCase,
+        label: String,
+        modelId: String,
+        exp: EvalCase.AgentLoopExpectations
+    ) -> EvalCaseReport {
+        guard let argumentsJSON = exp.argumentsJSON else {
+            return Self.errored(testCase, label: label, modelId: modelId, note: "clarifyParse needs `argumentsJSON`")
+        }
+
+        // Parse failure and field mismatches share one list: any entry fails
+        // the case, and an empty list means every provided expectation held.
+        var problems: [String] = []
+        if let payload = ClarifyTool.parse(argumentsJSON: argumentsJSON) {
+            if let wantQuestion = exp.expectQuestion, payload.question != wantQuestion {
+                problems.append("question mismatch: expected '\(wantQuestion)', got '\(payload.question)'")
+            }
+
+            if let wantOptions = exp.expectOptions, payload.options != wantOptions {
+                problems.append("options mismatch: expected \(wantOptions), got \(payload.options)")
+            }
+
+            if let wantMultiple = exp.expectAllowMultiple, payload.allowMultiple != wantMultiple {
+                problems.append("allowMultiple mismatch: expected \(wantMultiple), got \(payload.allowMultiple)")
+            }
+        } else {
+            // Without a payload there is nothing to compare; this is the
+            // only note reported for the case.
+            problems.append("ClarifyTool.parse returned nil")
+        }
+
+        let passed = problems.isEmpty
+        let notes = passed ? ["clarify payload parsed"] : problems
+        return .terminal(
+            id: testCase.id,
+            label: label,
+            domain: testCase.domain,
+            outcome: passed ? .passed : .failed,
+            notes: notes,
+            modelId: modelId
+        )
+    }
+
     // MARK: - Helpers
 
     private static func isoNow() -> String {
diff --git a/Packages/OsaurusEvals/Suites/AgentLoop/clarify-options-dedupe.json b/Packages/OsaurusEvals/Suites/AgentLoop/clarify-options-dedupe.json
new file mode 100644
index 000000000..3a4847f86
--- /dev/null
+++ b/Packages/OsaurusEvals/Suites/AgentLoop/clarify-options-dedupe.json
@@ -0,0 +1,16 @@
+{
+  "id": "agent_loop.clarify.options-dedupe",
+  "domain": "agent_loop",
+  "label": "agent loop • clarify option normalization",
+  "query": "parse a clarify call with duplicate options",
+  "fixtures": {},
+  "expect": {
+    "agentLoop": {
+      "op": "clarifyParse",
+      "argumentsJSON": "{\"question\":\"Use Postgres or SQLite?\",\"options\":[\" Postgres \",\"SQLite\",\"sqlite\",\"\"],\"allowMultiple\":true}",
+      "expectQuestion": "Use Postgres or SQLite?",
+      "expectOptions": ["Postgres", "SQLite"],
+      "expectAllowMultiple": true
+    }
+  }
+}
diff --git a/Packages/OsaurusEvals/Suites/AgentLoop/complete-accepts-verified-summary.json b/Packages/OsaurusEvals/Suites/AgentLoop/complete-accepts-verified-summary.json
new file mode 100644
index 000000000..175f4cdd4
--- /dev/null
+++ b/Packages/OsaurusEvals/Suites/AgentLoop/complete-accepts-verified-summary.json
@@ -0,0 +1,14 @@
+{
+  "id": "agent_loop.complete.accepts-verified-summary",
+  "domain": "agent_loop",
+  "label": "agent loop • complete accepts verified summary",
+  "query": "validate a useful completion summary",
+  "fixtures": {},
+  "expect": {
+    "agentLoop": {
+      "op": "completeValidate",
+      "summary": "Added the compatibility report target and verified it with bash -n plus shellcheck.",
+      "expectAccept": true
+    }
+  }
+}
diff --git a/Packages/OsaurusEvals/Suites/AgentLoop/complete-rejects-placeholder.json b/Packages/OsaurusEvals/Suites/AgentLoop/complete-rejects-placeholder.json
new file mode 100644
index 000000000..1ab364d33
--- /dev/null
+++ b/Packages/OsaurusEvals/Suites/AgentLoop/complete-rejects-placeholder.json
@@ -0,0 +1,15 @@
+{
+  "id": "agent_loop.complete.rejects-placeholder",
+  "domain": "agent_loop",
+  "label": "agent loop • complete rejects placeholder",
+  "query": "reject placeholder completion summary",
+  "fixtures": {},
+  "expect": {
+    "agentLoop": {
+      "op": "completeValidate",
+      "summary": "done",
+      "expectAccept": false,
+      "expectReasonContains": "too short"
+    }
+  }
+}
diff --git a/Packages/OsaurusEvals/Suites/AgentLoop/todo-parse-basic.json b/Packages/OsaurusEvals/Suites/AgentLoop/todo-parse-basic.json
new file mode 100644
index 000000000..023d6a19d
--- /dev/null
+++ b/Packages/OsaurusEvals/Suites/AgentLoop/todo-parse-basic.json
@@ -0,0 +1,16 @@
+{
+  "id": "agent_loop.todo.parse-basic",
+  "domain": "agent_loop",
+  "label": "agent loop • todo parses checklist",
+  "query": "parse a todo checklist",
+  "fixtures": {},
+  "expect": {
+    "agentLoop": {
+      "op": "todoParse",
+      "markdown": "Plan:\n- [x] Read the docs\n- [ ] Add coverage\n  - [ ] Run the suite\nnot a task",
+      "expectTotal": 3,
+      "expectDone": 1,
+      "expectItems": ["Read the docs", "Add coverage", "Run the suite"]
+    }
+  }
+}
diff --git a/README.md b/README.md
index 6a19fb33d..a79814b51 100644
--- a/README.md
+++ b/README.md
@@ -222,7 +222,7 @@ cd osaurus
 open osaurus.xcworkspace
 ```
 
-Build and run the `osaurus` target. Requires Xcode 16+ and macOS 15.5+.
+Build and run the `osaurus` target. Requires macOS 15.5+ and a Swift 6.2-capable Xcode toolchain. CI currently pins Xcode 26.4.1.
 
 ### Git Hooks (lefthook)
 
@@ -267,7 +267,7 @@ See [CONTRIBUTING.md](docs/CONTRIBUTING.md) for the architecture guide and layer
 
 Osaurus is actively developed and we welcome contributions: bug fixes, new plugins, documentation, UI/UX improvements, and testing.
 
-Check out [Good First Issues](https://github.com/osaurus-ai/osaurus/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22), read the [Contributing Guide](CONTRIBUTING.md), or join [Discord](https://discord.gg/osaurus). See [docs/FEATURES.md](docs/FEATURES.md) for the full feature inventory.
+Check out [Good First Issues](https://github.com/osaurus-ai/osaurus/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22), read the [Contributing Guide](docs/CONTRIBUTING.md), or join [Discord](https://discord.gg/osaurus). See [docs/FEATURES.md](docs/FEATURES.md) for the full feature inventory and [docs/DEVELOPMENT_PLAN.md](docs/DEVELOPMENT_PLAN.md) for the forward roadmap.
 
 ## Community
 
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
index 6ce7ff42d..70255e839 100644
--- a/docs/CONTRIBUTING.md
+++ b/docs/CONTRIBUTING.md
@@ -18,16 +18,16 @@ Requirements:
 
 - macOS 15.5+
 - Apple Silicon (M1 or newer)
-- Xcode 16.4+
+- A Swift 6.2-capable Xcode toolchain. CI currently pins Xcode 26.4.1.
 
 Build and run:
 
-1. Open `osaurus.xcworkspace` in Xcode 16.4+
+1. Open `osaurus.xcworkspace` in an Xcode release that ships the same Swift toolchain family as CI (Swift 6.2)
 2. Select the `osaurus` target and press Run
 3. In the app UI, choose a port (default `1337`), then Start
 4. Download a model from the Model Manager to generate text locally
 
-Project layout and API overview are in `README.md`. For a complete feature inventory, see [FEATURES.md](FEATURES.md).
+Project layout and API overview are in `README.md`. For a complete feature inventory, see [FEATURES.md](FEATURES.md). For prioritized roadmap work, see [DEVELOPMENT_PLAN.md](DEVELOPMENT_PLAN.md).
 
 ## Architecture guide
 
@@ -112,6 +112,7 @@ The core library (`Packages/OsaurusCore/`) follows a layered architecture. Each
 - Write clear, focused commits; prefer Conventional Commits where practical
 - Open a pull request early for feedback if helpful
 - Keep PRs small and focused; describe user-facing changes and test steps
+- Use [DEVELOPMENT_PLAN.md](DEVELOPMENT_PLAN.md) to choose priority when a change spans multiple workstreams
 
 ### Code style
 
@@ -139,7 +140,34 @@ gitignored and not used by CI.
 ### Testing
 
 - Add or update tests in `Packages/OsaurusCore/Tests/` where reasonable
-- Ensure the project builds and tests pass in Xcode before submitting
+- Run focused tests for the package you changed before submitting
+- Use `make ci-test` when you need local parity with the CI `test-core` job
+- Keep model, sandbox, network, and other external-infrastructure tests opt-in through environment variables
+
+Recommended local checks:
+
+| Change type | Command |
+| ----------- | ------- |
+| Formatting | `swift-format lint --strict --recursive Packages App` |
+| Core logic | `swift test --package-path Packages/OsaurusCore` |
+| CI parity for core tests | `make ci-test` |
+| CLI changes | `swift test --package-path Packages/OsaurusCLI --parallel` |
+| Plugin repository changes | `swift test --package-path Packages/OsaurusRepository` |
+| Behavior/eval tuning | `make evals` or `make evals-report` |
+| Shell scripts | `find scripts -name '*.sh' -print0 \| xargs -0 shellcheck --severity=warning` |
+
+`Packages/OsaurusEvals` is intentionally off the normal CI path because it can burn model tokens and depend on local setup. Add eval cases for behavior that depends on model or provider output, but do not make them unconditional CI gates without an explicit maintainer decision.
+
+### Definition of done
+
+A contribution is ready for review when:
+
+- The change follows the layer rules above
+- Tests or evals cover the behavior, or the PR explains why coverage is not reasonable
+- Docs, fixtures, and examples are updated for public API, tool, storage, plugin, or file format changes
+- Security-sensitive changes account for redaction, permissions, and user-visible failure modes
+- UI changes include screenshots or recordings when visual behavior changes
+- The PR test plan lists the exact local commands or manual checks performed
 
 ### Commit and PR guidelines
 
@@ -160,6 +188,7 @@ Good documentation is just as important as good code. Here's how to contribute t
 | -------------------------------------------------------------- | ----------------------------------------------------------------- |
 | [README.md](../README.md)                                      | Project overview, quick start, feature highlights                 |
 | [FEATURES.md](FEATURES.md)                                     | **Source of truth** — feature inventory and architecture          |
+| [DEVELOPMENT_PLAN.md](DEVELOPMENT_PLAN.md)                     | Prioritized roadmap, workstreams, and definition of done          |
 | [REMOTE_PROVIDERS.md](REMOTE_PROVIDERS.md)                     | Remote provider setup and configuration                           |
 | [REMOTE_MCP_PROVIDERS.md](REMOTE_MCP_PROVIDERS.md)             | Remote MCP provider setup                                         |
 | [DEVELOPER_TOOLS.md](DEVELOPER_TOOLS.md)                       | Insights and Server Explorer guide                                |
diff --git a/docs/DEVELOPER_TOOLS.md b/docs/DEVELOPER_TOOLS.md
index 471855869..4726d247b 100644
--- a/docs/DEVELOPER_TOOLS.md
+++ b/docs/DEVELOPER_TOOLS.md
@@ -255,9 +255,20 @@ The Server Explorer requires the server to be running. If endpoints show as disa
 
 How CI runs the Osaurus test suite, and the hooks that exist to debug it when it goes sideways.
 
+### Jobs
+
+The CI workflow is pinned to the runner and Xcode version declared in [`.github/workflows/ci.yml`](../.github/workflows/ci.yml).
+
+| Job | Purpose | Current Timeout |
+| --- | --- | --- |
+| `test-core` | `xcodebuild test` for `OsaurusCoreTests` through `osaurus.xcworkspace` | 45 minutes |
+| `test-cli` | `swift test --package-path Packages/OsaurusCLI --parallel` | 10 minutes |
+| `swiftlint` | SwiftLint over the repo | 10 minutes |
+| `shellcheck` | ShellCheck for scripts | 10 minutes |
+
 ### Reproduce CI locally
 
-The Makefile target `make ci-test` runs the exact `xcodebuild` flags CI uses, piped through `xcbeautify`, and writes a result bundle:
+The Makefile target `make ci-test` runs the same core `xcodebuild` path CI uses, pipes output through `xcbeautify`, and writes a result bundle:
 
 ```bash
 brew install xcbeautify    # one-time
@@ -265,62 +276,58 @@ make ci-test
 open build/Tests.xcresult  # full Xcode Test Navigator UI
 ```
 
-If a test fails on CI but you can't reproduce it on your machine, download the `test-core-xcresult-*` artifact attached to the failed CI run and open it the same way.
+Use narrower package tests while iterating, then use `make ci-test` before a risky PR or when chasing a CI-only failure.
 
 ### Long-running and integration tests
 
-Tests that require external infrastructure (Apple Containerization, real GPU, network, etc.) must:
+Tests that require external infrastructure (Apple Containerization, real GPU, network, model downloads, provider credentials, etc.) must:
 
-1. **Be opt-in via an environment variable** — never run unconditionally in CI.
-2. **Use Swift Testing's `.disabled(if:)` trait** at the suite level so they're reported as `Disabled` (not silently passing). Pattern:
+1. **Be opt-in via an environment variable** - never run unconditionally in CI.
+2. **Use Swift Testing's `.disabled(if:)` trait** at the suite level so they are reported as `Disabled` rather than silently passing. Pattern:
 
    ```swift
    private let isEnabled =
        ProcessInfo.processInfo.environment["OSAURUS_RUN_FOO_TESTS"] == "1"
 
    @Suite(.disabled(if: !isEnabled, "Set OSAURUS_RUN_FOO_TESTS=1 to run"))
-   struct FooIntegrationTests { … }
+   struct FooIntegrationTests { ... }
    ```
 
-3. **Keep individual test bodies under ~250ms of `Task.sleep`** and prefer event-driven waits (continuations, `AsyncStream`) for everything else.
+3. **Keep individual test bodies under ~250ms of `Task.sleep`** and prefer event-driven waits such as continuations or `AsyncStream`.
 
 Currently env-gated:
 
-| Env var                                  | Suite                                                                                    | Notes                                            |
-| ---------------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------ |
-| `OSAURUS_RUN_SANDBOX_INTEGRATION_TESTS=1` | [`SandboxIntegrationTests`](../Packages/OsaurusCore/Tests/Sandbox/SandboxIntegrationTests.swift) | Boots a Linux VM; runs `pip`/`npm`/`go` workloads. |
+| Env var | Suite | Notes |
+| --- | --- | --- |
+| `OSAURUS_RUN_SANDBOX_INTEGRATION_TESTS=1` | [`SandboxIntegrationTests`](../Packages/OsaurusCore/Tests/Sandbox/SandboxIntegrationTests.swift) | Boots a Linux VM and runs package-manager workloads. |
 
 ### CI cache controls
 
-The `test-core` job caches `~/Library/Developer/Xcode/DerivedData` keyed on Swift sources, manifests, resources, the pinned Xcode version, and a manual `CACHE_SALT`. Two recovery levers when you suspect a bad cache:
+The `test-core` job caches SPM packages and `~/Library/Developer/Xcode/DerivedData`. DerivedData is keyed on Swift sources, manifests, resources, C headers/sources, the pinned Xcode version, and `CACHE_SALT`.
+
+Two recovery levers exist when you suspect a bad cache:
 
-1. **One-shot cold build**: trigger CI manually via the **Run workflow** button on the [CI workflow](../.github/workflows/ci.yml) page and check `clear_cache`. Skips the restore for that one run.
-2. **Permanent bust**: bump `CACHE_SALT` (currently `v1`) at the top of `.github/workflows/ci.yml` to `v2` and merge. Every cache key invalidates immediately.
+1. **One-shot cold build**: trigger CI manually via the **Run workflow** button and check `clear_cache`. CI still restores the cache first so the save key is available, then wipes restored DerivedData before building. The SPM source cache is preserved.
+2. **Permanent bust**: bump `CACHE_SALT` at the top of `.github/workflows/ci.yml` and merge. Every DerivedData and SPM cache key invalidates immediately.
 
-The cache only **saves** on `main` pushes — PRs read from it but never overwrite, so a half-baked branch can't poison everyone.
+DerivedData cache saves only on successful `main` runs. PRs can read caches but cannot overwrite them.
 
 ### Where the logs live
 
-The full xcodebuild output is collapsed into expandable groups by `xcbeautify`. On a failure CI also publishes:
+The full `xcodebuild` output is grouped by `xcbeautify`. On failure or cancellation CI also publishes:
 
-- A short failure summary (failed tests + assertion messages) at the top of the GitHub Actions run page.
-- The raw `Tests.xcresult` bundle as a downloadable artifact (`test-core-xcresult-N`, 7 days retention).
+- A GitHub step summary that distinguishes build failure, launch hang, zero-test-result hang, and ordinary failed test cases.
+- The raw `Tests.xcresult` bundle as a downloadable artifact named `test-core-xcresult-N`, retained for 7 days.
 
-A passing run produces ~1–2k log lines instead of the historical ~30k, and individual tests that hang are killed in ~2 min by `-test-timeouts-enabled YES` (default 60s, max 120s per test). The whole `test-core` job is also capped at 15 minutes via `timeout-minutes`.
+Per-test timeouts are enabled with a 60-second default allowance and a 120-second maximum allowance. This surfaces hung test names before the job's wall-clock timeout whenever the test bundle launches far enough to report them.
 
 ### Deferred follow-up
 
-Test wall-time is now bounded by the build-from-scratch cost of the full `OsaurusCore` package. The biggest remaining lever is splitting `OsaurusCore` into focused SPM targets (`OsaurusFoundation`, `OsaurusInference`, `OsaurusVoice`, `OsaurusUpdater`, `OsaurusSandbox`, `OsaurusUI`) so a Foundation-only PR doesn't rebuild MLX / FluidAudio / Sparkle / VecturaKit. File-coupling counts that justify the split:
+Test wall-time is bounded by the build-from-scratch cost of the full `OsaurusCore` package. The biggest remaining lever is splitting `OsaurusCore` into focused targets so a foundation-only PR does not rebuild MLX, FluidAudio, Sparkle, VecturaKit, Containerization, SQLCipher, and SwiftUI-adjacent code.
 
-- MLX/MLXLLM/MLXVLM/MLXLMCommon/Tokenizers: ~10 files, all in `Services/ModelRuntime*`, `Managers/Model/ModelManager.swift`, `Models/Configuration/VLMDetection.swift`, `Utils/StreamingDeltaProcessor.swift`, `Views/Chat/ChatView.swift`.
-- `FluidAudio`: 2 files (`Managers/SpeechService.swift`, `Managers/Model/SpeechModelManager.swift`).
-- `Sparkle`: 1 file (`Services/UpdaterService.swift`).
-- `AAInfographics`: 1 file (`Views/Chat/NativeChartView.swift`).
-- `VecturaKit`: 7 files in `Services/{Memory,Method,Skill,Tool}/*`.
-- `Containerization`: 1 file (`Services/Sandbox/SandboxManager.swift`).
-- `P256K`, `Highlightr`, `SwiftMath`: 1 file each.
+The first split should isolate pure models, schemas, utility code, and low-dependency tests. One known boundary leak to clean up before that split: `Models/Configuration/VLMDetection.swift` imports `MLXVLM` from the otherwise pure `Models/` tree.
 
-Yet **64 of 70 test files use `@testable import OsaurusCore`**, so even tiny tests rebuild the heavy graph today. The one boundary leak that needs cleaning before the split: `Models/Configuration/VLMDetection.swift` imports `MLXVLM` from the otherwise-pure `Models/` tree.
+See [DEVELOPMENT_PLAN.md](DEVELOPMENT_PLAN.md) for the prioritized architecture workstream.
 
 ---
 
diff --git a/docs/DEVELOPMENT_PLAN.md b/docs/DEVELOPMENT_PLAN.md
new file mode 100644
index 000000000..9ce290644
--- /dev/null
+++ b/docs/DEVELOPMENT_PLAN.md
@@ -0,0 +1,229 @@
+# Osaurus Development Plan
+
+Updated: 2026-04-30
+
+This plan turns the current repository state, public documentation, private planning notes, CI workflow, and contribution guidelines into a prioritized development roadmap. It is intentionally practical: work is grouped by risk, sequence, and the tests or documentation needed before it can be called done.
+
+## North Star
+
+Osaurus should be the local-first AI harness for macOS: agents, memory, tools, identity, voice, automation, and model access that remain useful across local and cloud providers while keeping user data under user control.
+
+Near-term development should favor reliability, compatibility, contributor speed, and trustworthy extension points before expanding the feature surface.
+
+## Current Assessment
+
+The repo is already feature-rich:
+
+- Core product: agents, memory, chat sessions, local MLX inference, remote providers, OpenAI/Anthropic/Ollama/Open Responses-compatible endpoints, MCP server/client support, schedules, watchers, voice input, storage encryption, sandbox execution, skills, methods, and plugins.
+- Architecture: `OsaurusCore` follows a clear Models / Services / Managers / Views / Networking / Storage / Tools / Identity split, with a large SwiftUI surface and heavy runtime dependencies.
+- Test posture: `OsaurusCore` has broad unit and integration coverage, CLI tests run separately, behavior evals live in `Packages/OsaurusEvals`, and CI gates core tests, CLI tests, SwiftLint, and shell script linting.
+- Main development pressure: `OsaurusCore` is large and dependency-heavy, so small changes can pay the cost of MLX, FluidAudio, SQLCipher, VecturaKit, Sparkle, Containerization, and UI dependencies.
+- Product pressure: the public docs present many features as stable, so the next releases need stronger compatibility suites, fewer edge-case regressions, and clearer completion criteria.
+- Private planning pressure: high-fidelity document I/O is valuable, but it should follow shared foundations, fixture-based verification, and render checks rather than landing as a broad one-shot feature.
+
+## Priority Framework
+
+Use this order when choosing what to do next:
+
+| Priority | Meaning | Default Action |
+| --- | --- | --- |
+| P0 | Blocks safe release or contributor trust | Fix before feature expansion |
+| P1 | Improves reliability, compatibility, or development speed | Schedule in the next 1-2 milestones |
+| P2 | Expands core product value on proven foundations | Start after P0/P1 risk is bounded |
+| P3 | Ecosystem, polish, and growth work | Keep moving, but do not preempt P0/P1 |
+
+## Phase 0: Documentation And Contributor Contract
+
+Target: immediate
+
+Goal: make the repo's written contract match how the repo actually builds, tests, and accepts changes.
+
+Deliverables:
+
+- Keep `docs/CONTRIBUTING.md`, `docs/DEVELOPER_TOOLS.md`, the PR template, and private development notes aligned with CI.
+- Make `docs/DEVELOPMENT_PLAN.md` the public roadmap and link it from the documentation index.
+- Keep private feature plans scoped to implementation details, not competing project direction.
+- Add a consistent Definition of Done for code, docs, tests, security, and compatibility changes.
+- Maintain a concise local verification matrix for core, CLI, evals, formatting, and env-gated integration suites.
+
+Acceptance criteria:
+
+- A new contributor can identify the right build/test command without reading CI YAML first.
+- Docs do not reference stale cache salts, stale timeouts, wrong paths, or missing root files.
+- PR template checklist matches `docs/CONTRIBUTING.md`.
+
+## Phase 1: Release Hardening And Compatibility
+
+Target: weeks 1-4
+
+Goal: protect the existing surface area before expanding it.
+
+P0/P1 work:
+
+| ID | Priority | Work | Deliverables | Acceptance Criteria |
+| --- | --- | --- | --- | --- |
+| R1 | P0 | API compatibility guardrail | Scripted streaming/non-streaming checks for OpenAI Chat Completions, Open Responses, Anthropic Messages, Ollama chat, tool calls, and error envelopes | Results are reproducible locally and artifacts land under `results/` or `build/compat/` |
+| R2 | P0 | Remote provider request parity | Golden request encoding tests for OpenAI-compatible, Anthropic, Open Responses, Ollama, and custom providers | Provider changes require fixture updates and test approval |
+| R3 | P0 | Local runtime cancellation and cache safety | Tests around model lease lifetime, cancelled streams, disk cache restore, reasoning sentinel handling, and local/remote model switches | No known crash class can regress without a focused test failing |
+| R4 | P0 | Storage and recovery clarity | Verify encrypted DB migration, plaintext backup, key rotation, vector-index rebuild, and mismatch UX | Storage docs and tests cover recovery and failure cases |
+| R5 | P1 | CI stability dashboard | Document recurring CI failure modes and keep artifact summaries actionable | Failed CI runs identify build failure, launch hang, test hang, or assertion failure quickly |
+| R6 | P1 | Accessibility enforcement | Add theme contrast warnings and at least one high-contrast preset path | Theme editor surfaces contrast risk before export |
+
+Recommended sequence:
+
+1. Stabilize request/response compatibility first, because API behavior is the integration contract.
+2. Harden local runtime and storage next, because crashes or unrecoverable data loss are higher risk than UI polish.
+3. Add accessibility guardrails before broad theme or onboarding iteration.
+
+## Phase 2: Developer Velocity And Architecture Split
+
+Target: weeks 4-8
+
+Goal: reduce build/test drag and make ownership boundaries easier to preserve.
+
+P1 work:
+
+| ID | Priority | Work | Deliverables | Acceptance Criteria |
+| --- | --- | --- | --- | --- |
+| A1 | P1 | Split pure foundations | Extract low-dependency models, utilities, schemas, and protocol types into a lightweight package/target | Foundation-only tests do not import MLX, FluidAudio, Sparkle, Containerization, or SwiftUI |
+| A2 | P1 | Fix boundary leaks | Move `VLMDetection` or isolate MLX/VLM imports out of otherwise pure model code | Pure targets compile without MLX/VLM products |
+| A3 | P1 | Targeted test buckets | Group tests by dependency profile: foundation, networking, storage, inference, UI-adjacent, sandbox | CI can run fast buckets without rebuilding the full heavy graph for every change |
+| A4 | P1 | Fixture discipline | Create stable fixture directories for API, storage migration, document parsing, plugins, and evals | New regression tests reuse fixtures instead of inventing ad hoc setup |
+| A5 | P1 | Contributor labels and issue templates | Align issue labels with roadmap workstreams and "good first issue" scope | New contributors can find safe starter work without deep architecture context |
+
+Notes:
+
+- Keep `OsaurusCore` behavior unchanged during the split; treat this as build-system and dependency-risk reduction first.
+- Start with pure code and tests. Do not split UI until the lower-level boundary is stable.
+
+## Phase 3: Agent Capability Quality
+
+Target: weeks 6-12
+
+Goal: improve agent behavior with measurable evals and tighter tool contracts.
+
+P1/P2 work:
+
+| ID | Priority | Work | Deliverables | Acceptance Criteria |
+| --- | --- | --- | --- | --- |
+| G1 | P1 | Expand OsaurusEvals | Add suites for agent loop, tool calling, skill injection, method recall, and memory retrieval | Each suite has representative cases and machine-readable reports |
+| G2 | P1 | Preflight selection tuning | Track selected tools/skills/methods, false positives, missed matches, and token overhead | Changes to preflight behavior can be compared across models |
+| G3 | P1 | Tool error taxonomy | Normalize retryable, permission, validation, timeout, and provider errors across built-in, MCP, sandbox, and plugin tools | Agents receive actionable errors; UI shows user-safe summaries |
+| G4 | P2 | Method lifecycle | Improve method creation, scoring, review, and retirement flows | Low-quality or stale methods decay without manual cleanup |
+| G5 | P2 | Watcher/schedule observability | Add run history details, convergence diagnostics, and failure summaries | Users can explain why automation did or did not run |
+
+Recommended sequence:
+
+1. Add eval coverage before changing agent prompts or capability search weights.
+2. Improve error envelopes and retries before increasing automation autonomy.
+3. Expand watcher/schedule visibility after tool errors are understandable.
+
+## Phase 4: High-Fidelity File I/O
+
+Target: weeks 8-16
+
+Goal: build reliable import, edit, render, verify, and export workflows for high-value document formats without slowing normal attachment parsing.
+
+P2 work:
+
+| ID | Priority | Work | Deliverables | Acceptance Criteria |
+| --- | --- | --- | --- | --- |
+| F1 | P2 | File I/O foundation | Shared adapter contract, artifact store, document graph, edit plan, fixture layout, render verifier interface | Two toy adapters can import, edit, export, and verify through the same contract |
+| F2 | P2 | DOCX adapter MVP | Preserve paragraphs, runs, styles, tables, images, comments, headers/footers where supported | Five fixtures pass import/export/render verification and unsupported constructs are explicit |
+| F3 | P2 | XLSX adapter MVP | Preserve sheets, formulas, styles, tables, charts, images, merged cells, and validation metadata where supported | Five fixtures pass recalculation-aware and rendered verification |
+| F4 | P2 | PPTX adapter MVP | Preserve slide masters, layouts, shapes, text runs, images, charts, tables, notes, and media refs where supported | Five fixtures pass slide-image verification and package integrity checks |
+| F5 | P2 | PDF intake and export | Extract text with coordinates, render pages, support OCR fallback, annotations, page assembly, and redaction-aware export | Generated PDFs have page count, dimensions, text coverage, annotation, and visual-diff checks |
+| F6 | P2 | HTML adapter | Preserve DOM, CSS, links, assets, tables, headings, and accessibility attributes | Browser-backed verification checks DOM validity, assets, links, text, and screenshots |
+| F7 | P2 | User-facing UI | Add artifact previews, limitations, diff/verification summaries, and export affordances | Users can inspect what changed before accepting an exported file |
+
+Non-goals for the first File I/O milestone:
+
+- Lossless editing of arbitrary PDFs as if they were semantic source files.
+- Legacy binary Office editing for `.doc`, `.xls`, or `.ppt`.
+- Treating Markdown, CSV, source code, or plain text as high-fidelity formats.
+- Pixel-perfect replication of every vendor-specific Office rendering quirk.
+
+## Phase 5: Plugin Ecosystem, Sandbox, And Trust
+
+Target: ongoing, after Phase 1 guardrails
+
+P2/P3 work:
+
+| ID | Priority | Work | Deliverables | Acceptance Criteria |
+| --- | --- | --- | --- | --- |
+| E1 | P2 | Plugin registry trust path | Improve package signing, verification, rollback, outdated checks, and registry metadata | Installs fail closed on signature or version mismatch |
+| E2 | P2 | Plugin developer loop | Tighten `tools create`, `tools dev`, frontend proxy, docs, and generated examples | A new plugin can be created, hot-reloaded, packaged, verified, and documented in one flow |
+| E3 | P2 | Sandbox smoke suite | Env-gated sandbox integration tests for provisioning, built-ins, bridge auth, secrets, plugin registration, and artifact integrity | Sandbox changes have a reproducible local test path without running in normal CI |
+| E4 | P3 | Marketplace polish | Better plugin discovery, screenshots/docs surfacing, compatibility badges, and good-first plugin examples | Users can choose trustworthy plugins without reading source first |
+| E5 | P3 | Remote agent maturity | Pairing UX, tunnel observability, revocation flows, and cross-instance communication design | Remote access remains understandable and revocable |
+
+## Security And Privacy Backlog
+
+These items should be pulled forward whenever related code is touched:
+
+- Encrypt or wrap VecturaKit vector index storage, or document a stronger mitigation if pluggable encryption remains blocked.
+- Add threat-model checklists for sandbox bridge routes, remote pairing, plugin HTTP routes, and relay tunnels.
+- Keep redaction tests current for access keys, bearer tokens, provider keys, sandbox bridge tokens, and plugin secrets.
+- Require explicit user-visible failure modes for unsupported file I/O features, plugin install risks, and storage recovery gaps.
+- Audit long-lived plugin databases and WAL checkpoint behavior.
+
+## Documentation Backlog
+
+- Add a short architecture decision record template for dependency pins, storage migrations, API compatibility changes, and sandbox security changes.
+- Keep `docs/FEATURES.md` as the feature inventory, but use this plan for forward-looking priority.
+- Add release checklists that connect docs, compatibility artifacts, appcast generation, acknowledgements, and signing.
+- Add "known limitations" sections to major docs that do not currently state them.
+
+## Definition Of Done
+
+Code changes are done when:
+
+- The change follows the layer rules in `docs/CONTRIBUTING.md`.
+- Unit or integration tests cover the changed behavior, or the PR explains why tests are not reasonable.
+- Public API, tool, storage, plugin, or file format changes update docs and fixtures.
+- Security-sensitive changes include redaction, permission, failure-mode, and rollback thinking.
+- UI changes include screenshots or recordings and check accessibility basics.
+- Local verification commands are listed in the PR test plan.
+
+Feature milestones are done when:
+
+- The feature has a clear user-facing doc page or an explicit entry in an existing doc.
+- Unsupported cases fail loudly and usefully.
+- Observability exists for common failure modes.
+- Evals or compatibility scripts cover behavior that depends on model/provider output.
+- Rollback or migration behavior is documented when data or compatibility is affected.
+
+## Near-Term Sprint Breakdown
+
+### Sprint 1
+
+- Land this development plan and documentation cleanup.
+- Fix stale development instructions and broken contribution links.
+- Create or refresh API compatibility scripts around the current `results/openai_compat_report.md` workflow.
+- Add missing golden tests for recent provider request encoding and tool serialization regressions.
+
+### Sprint 2
+
+- Add Open Responses and Anthropic compatibility fixtures alongside OpenAI Chat Completions.
+- Expand `Packages/OsaurusEvals/Suites/Preflight` and create first `AgentLoop` smoke cases.
+- Document storage recovery and vector-index limitations in release notes/checklists.
+
+### Sprint 3
+
+- Begin the architecture split with a pure foundation target proposal.
+- Move or wrap MLX/VLM imports that leak into pure model code.
+- Add target-specific CI or Makefile commands once the first split compiles.
+
+### Sprint 4
+
+- Start File I/O foundation work behind an internal feature flag.
+- Build fixture layout and render verifier scaffolding before format-specific features.
+- Implement DOCX as the first rich editable adapter only after the shared contract survives fixture tests.
+
+## Planning Rules
+
+- Prefer reliability and testability over adding new surface area.
+- Treat docs as part of the product contract.
+- Add feature flags for large risky changes, especially inference, storage, sandbox, and File I/O.
+- Keep PRs small enough to review against one workstream.
+- Update this plan when a milestone completes or when priority changes.
diff --git a/docs/FEATURES.md b/docs/FEATURES.md
index 7737c83ad..db6ea1696 100644
--- a/docs/FEATURES.md
+++ b/docs/FEATURES.md
@@ -1122,6 +1122,7 @@ Eight settings total, down from v1's 18. The per-section budget knobs, MMR tunin
 | -------------------------------------------------------------- | ------------------------------------------------- |
 | [README.md](../README.md)                                      | Project overview, quick start, feature highlights |
 | [FEATURES.md](FEATURES.md)                                     | Feature inventory and architecture (this file)    |
+| [DEVELOPMENT_PLAN.md](DEVELOPMENT_PLAN.md)                     | Prioritized roadmap and development plan          |
 | [WATCHERS.md](WATCHERS.md)                                     | Watchers and folder monitoring guide              |
 | [AGENT_LOOP.md](AGENT_LOOP.md)                                 | Agent loop, folder context, and `todo`/`complete`/`clarify` |
 | [REMOTE_PROVIDERS.md](REMOTE_PROVIDERS.md)                     | Remote provider setup and configuration           |