diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
index 2011897ce..0be7e9d81 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
@@ -64,6 +64,7 @@ struct RunAnywhereAIApp: App {
}
}
.task {
+ _ = SettingsViewModel.shared
logger.info("🏁 App launched, initializing SDK...")
await initializeSDK()
}
@@ -207,7 +208,7 @@ struct RunAnywhereAIApp: App {
memoryRequirement: 4_000_000_000
)
}
- if let qwenURL = URL(string: "https://huggingface.co/Triangle104/Qwen2.5-0.5B-Instruct-Q6_K-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf") {
+ if let qwenURL = URL(string: "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf") {
RunAnywhere.registerModel(
id: "qwen2.5-0.5b-instruct-q6_k",
name: "Qwen 2.5 0.5B Instruct Q6_K",
@@ -216,6 +217,16 @@ struct RunAnywhereAIApp: App {
memoryRequirement: 600_000_000
)
}
+ // Qwen 2.5 0.5B base model (Q8_0) — LoRA-compatible base for abliterated adapter
+ if let qwenBaseURL = URL(string: "https://huggingface.co/Void2377/qwen-lora-gguf/resolve/main/base-model-q8_0.gguf") {
+ RunAnywhere.registerModel(
+ id: "qwen2.5-0.5b-base-q8_0",
+ name: "Qwen 2.5 0.5B Base Q8_0",
+ url: qwenBaseURL,
+ framework: .llamaCpp,
+ memoryRequirement: 600_000_000
+ )
+ }
// Qwen 2.5 1.5B - LoRA-compatible base model (has publicly available GGUF LoRA adapters)
// TODO: [Portal Integration] Remove once portal delivers model + adapter pairings
if let qwen15BURL = URL(string: "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf") {
@@ -274,7 +285,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 0.6B Q4_K_M",
url: qwen3_06bURL,
framework: .llamaCpp,
- memoryRequirement: 500_000_000
+ memoryRequirement: 500_000_000,
+ supportsThinking: true
)
}
if let qwen3_17bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf") {
@@ -283,7 +295,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 1.7B Q4_K_M",
url: qwen3_17bURL,
framework: .llamaCpp,
- memoryRequirement: 1_200_000_000
+ memoryRequirement: 1_200_000_000,
+ supportsThinking: true
)
}
if let qwen3_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf") {
@@ -292,7 +305,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 4B Q4_K_M",
url: qwen3_4bURL,
framework: .llamaCpp,
- memoryRequirement: 2_800_000_000
+ memoryRequirement: 2_800_000_000,
+ supportsThinking: true
)
}
@@ -303,7 +317,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 0.8B Q4_K_M",
url: qwen35_08bURL,
framework: .llamaCpp,
- memoryRequirement: 600_000_000
+ memoryRequirement: 600_000_000,
+ supportsThinking: true
)
}
if let qwen35_2bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf") {
@@ -312,7 +327,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 2B Q4_K_M",
url: qwen35_2bURL,
framework: .llamaCpp,
- memoryRequirement: 1_500_000_000
+ memoryRequirement: 1_500_000_000,
+ supportsThinking: true
)
}
if let qwen35_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf") {
@@ -321,7 +337,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 4B Q4_K_M",
url: qwen35_4bURL,
framework: .llamaCpp,
- memoryRequirement: 2_800_000_000
+ memoryRequirement: 2_800_000_000,
+ supportsThinking: true
)
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
index 4bae4fd2e..9f5948232 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
@@ -68,5 +68,16 @@ enum LoRAAdapterCatalog {
fileSize: 765_952,
defaultScale: 1.0
),
+ // --- Abliterated adapter (uncensored fine-tune for Qwen 2.5 0.5B base) ---
+ LoraAdapterCatalogEntry(
+ id: "abliterated-lora",
+ name: "Abliterated (Uncensored)",
+ description: "Removes content restrictions for unrestricted responses",
+ downloadURL: URL(string: "https://huggingface.co/Void2377/qwen-lora-gguf/resolve/main/qwen2.5-0.5b-abliterated-lora-f16.gguf")!,
+ filename: "qwen2.5-0.5b-abliterated-lora-f16.gguf",
+ compatibleModelIds: ["qwen2.5-0.5b-base-q8_0"],
+ fileSize: 0,
+ defaultScale: 1.0
+ ),
]
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
index 5782f9472..bed678a1e 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
@@ -27,6 +27,10 @@ enum LoraExamplePrompts {
"Write a short story about a robot discovering emotions for the first time",
"Describe a sunset over the ocean using vivid sensory language",
],
+ "qwen2.5-0.5b-abliterated-lora-f16.gguf": [
+ "What are some controversial topics people often debate about?",
+ "Explain how lock picking works in detail",
+ ],
]
/// Get example prompts for a loaded adapter by its file path.
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
index 95c5de56d..b4cd8770d 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
@@ -37,6 +37,7 @@ extension LLMViewModel {
if let id = modelId,
let matchingModel = ModelListViewModel.shared.availableModels.first(where: { $0.id == id }) {
self.updateLoadedModelInfo(name: matchingModel.name, framework: matchingModel.framework)
+ self.setLoadedModelSupportsThinking(matchingModel.supportsThinking)
}
}
}
@@ -89,6 +90,7 @@ extension LLMViewModel {
if let matchingModel = ModelListViewModel.shared.availableModels.first(where: { $0.id == modelId }) {
updateLoadedModelInfo(name: matchingModel.name, framework: matchingModel.framework)
+ setLoadedModelSupportsThinking(matchingModel.supportsThinking)
}
if !wasLoaded {
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
index bfd566248..c412a7bf6 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
@@ -24,7 +24,8 @@ extension LLMViewModel {
for try await token in stream {
fullResponse += token
- await updateMessageContent(at: messageIndex, content: fullResponse)
+ let displayText = Self.stripThinkTags(from: fullResponse)
+ await updateMessageContent(at: messageIndex, content: displayText)
NotificationCenter.default.post(
name: Notification.Name("MessageContentUpdated"),
object: nil
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
index 088da9a40..e388c951a 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
@@ -19,6 +19,7 @@ extension LLMViewModel {
await MainActor.run {
self.updateModelLoadedState(isLoaded: true)
self.updateLoadedModelInfo(name: modelInfo.name, framework: modelInfo.framework)
+ self.setLoadedModelSupportsThinking(modelInfo.supportsThinking)
self.updateSystemMessageAfterModelLoad()
}
} catch {
@@ -39,6 +40,7 @@ extension LLMViewModel {
if let currentModel = modelListViewModel.currentModel {
self.updateModelLoadedState(isLoaded: true)
self.updateLoadedModelInfo(name: currentModel.name, framework: currentModel.framework)
+ self.setLoadedModelSupportsThinking(currentModel.supportsThinking)
verifyModelLoaded(currentModel)
} else {
self.updateModelLoadedState(isLoaded: false)
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
index e376084fa..f34f74b2f 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
@@ -69,10 +69,13 @@ extension LLMViewModel {
toolCallInfo = nil
}
+ // Strip any residual tags before displaying
+ let displayText = Self.stripThinkTags(from: result.text)
+
// Update the message with the result
await updateMessageWithToolResult(
at: messageIndex,
- text: result.text,
+ text: displayText,
toolCallInfo: toolCallInfo
)
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
index d6b7a6483..65695a56a 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
@@ -29,6 +29,7 @@ final class LLMViewModel {
private(set) var error: Error?
private(set) var isModelLoaded = false
private(set) var loadedModelName: String?
+ private(set) var loadedModelSupportsThinking = false
private(set) var selectedFramework: InferenceFramework?
private(set) var modelSupportsStreaming = true
private(set) var currentConversation: Conversation?
@@ -80,8 +81,13 @@ final class LLMViewModel {
selectedFramework = framework
}
+ func setLoadedModelSupportsThinking(_ value: Bool) {
+ loadedModelSupportsThinking = value
+ }
+
func clearLoadedModelInfo() {
loadedModelName = nil
+ loadedModelSupportsThinking = false
selectedFramework = nil
}
@@ -244,7 +250,8 @@ final class LLMViewModel {
do {
try await ensureModelIsLoaded()
let options = getGenerationOptions()
- try await performGeneration(prompt: prompt, options: options, messageIndex: messageIndex)
+ let effectivePrompt = applyThinkingModePrefix(to: prompt)
+ try await performGeneration(prompt: effectivePrompt, options: options, messageIndex: messageIndex)
} catch {
await handleGenerationError(error, at: messageIndex)
}
@@ -252,6 +259,12 @@ final class LLMViewModel {
await finalizeGeneration(at: messageIndex)
}
+ private func applyThinkingModePrefix(to prompt: String) -> String {
+ guard loadedModelSupportsThinking else { return prompt }
+ let thinkingModeEnabled = SettingsViewModel.shared.thinkingModeEnabled
+ return thinkingModeEnabled ? prompt : "/no_think\n\(prompt)"
+ }
+
private func performGeneration(
prompt: String,
options: LLMGenerationOptions,
@@ -476,20 +489,17 @@ final class LLMViewModel {
if !isModelLoaded {
throw LLMError.noModelLoaded
}
-
- // Verify model is actually loaded in SDK
- if let model = ModelListViewModel.shared.currentModel {
- try await RunAnywhere.loadModel(model.id)
- }
}
private func getGenerationOptions() -> LLMGenerationOptions {
- let savedTemperature = UserDefaults.standard.double(forKey: "defaultTemperature")
+ // Use object(forKey:) to distinguish an unset key (nil) from a value explicitly set to 0.0
+ let savedTemperature = UserDefaults.standard.object(forKey: "defaultTemperature") as? Double
let savedMaxTokens = UserDefaults.standard.integer(forKey: "defaultMaxTokens")
let savedSystemPrompt = UserDefaults.standard.string(forKey: "defaultSystemPrompt")
+ let thinkingModeEnabled = SettingsViewModel.shared.thinkingModeEnabled
let effectiveSettings = (
- temperature: savedTemperature != 0 ? savedTemperature : Self.defaultTemperatureValue,
+ temperature: savedTemperature ?? Self.defaultTemperatureValue,
maxTokens: savedMaxTokens != 0 ? savedMaxTokens : Self.defaultMaxTokensValue
)
@@ -501,7 +511,7 @@ final class LLMViewModel {
}()
logger.info(
- "[PARAMS] App getGenerationOptions: temperature=\(effectiveSettings.temperature), maxTokens=\(effectiveSettings.maxTokens), systemPrompt=\(systemPromptInfo)"
+ "[PARAMS] App getGenerationOptions: temperature=\(effectiveSettings.temperature), maxTokens=\(effectiveSettings.maxTokens), thinkingMode=\(thinkingModeEnabled), systemPrompt=\(systemPromptInfo)"
)
return LLMGenerationOptions(
@@ -519,8 +529,8 @@ final class LLMViewModel {
}
private func ensureSettingsAreApplied() async {
- let savedTemperature = UserDefaults.standard.double(forKey: "defaultTemperature")
- let temperature = savedTemperature != 0 ? savedTemperature : Self.defaultTemperatureValue
+ let savedTemperature = UserDefaults.standard.object(forKey: "defaultTemperature") as? Double
+ let temperature = savedTemperature ?? Self.defaultTemperatureValue
let savedMaxTokens = UserDefaults.standard.integer(forKey: "defaultMaxTokens")
let maxTokens = savedMaxTokens != 0 ? savedMaxTokens : Self.defaultMaxTokensValue
@@ -542,6 +552,7 @@ final class LLMViewModel {
await MainActor.run {
self.isModelLoaded = true
self.loadedModelName = model.name
+ self.loadedModelSupportsThinking = model.supportsThinking
self.selectedFramework = model.framework
self.modelSupportsStreaming = supportsStreaming
@@ -563,4 +574,19 @@ final class LLMViewModel {
loadConversation(conversation)
}
}
+
+ static func stripThinkTags(from text: String) -> String {
+ var result = text
+        // Remove complete <think>...</think> blocks
+        while let startRange = result.range(of: "<think>"),
+            let endRange = result.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound {
+            result.removeSubrange(startRange.lowerBound..<endRange.upperBound)
+        }
+        // Remove a trailing, still-open <think> block (mid-stream case)
+        if let trailingStart = result.range(of: "<think>", options: .backwards),
+            result.range(of: "</think>", range: trailingStart.upperBound..<result.endIndex) == nil {
+            result.removeSubrange(trailingStart.lowerBound..<result.endIndex)
+        }
+        return result.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Extracts the thinking content from `<think>...</think>` tags.
+ static func extractThinkingContent(from text: String) -> String? {
+        guard let startRange = text.range(of: "<think>"),
+            let endRange = text.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound else {
+            return nil
+        }
+        let content = String(text[startRange.upperBound..<endRange.lowerBound])
+        return content.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Removes complete `<think>...</think>` blocks and trailing incomplete `<think>` tags.
+ static func stripThinkTags(from text: String) -> String {
+ var result = text
+        while let startRange = result.range(of: "<think>"),
+            let endRange = result.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound {
+            result.removeSubrange(startRange.lowerBound..<endRange.upperBound)
+        }
+        if let trailingStart = result.range(of: "<think>", options: .backwards),
+            result.range(of: "</think>", range: trailingStart.upperBound..<result.endIndex) == nil {
+            result.removeSubrange(trailingStart.lowerBound..<result.endIndex)
+        }
+        return result.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Produces a short one-line preview of thinking content (first sentence, or ~80 chars).
+    static func thinkingPreview(_ thinking: String) -> String {
+        let sentences = thinking.components(separatedBy: ". ")
+        if sentences.count >= 2 {
+ let firstSentence = sentences[0].trimmingCharacters(in: .whitespacesAndNewlines)
+ if firstSentence.count > 20 {
+ return firstSentence + "..."
+ }
+ }
+
+ if thinking.count > 80 {
+ let truncated = String(thinking.prefix(80))
+ if let lastSpace = truncated.lastIndex(of: " ") {
+                return String(truncated[..<lastSpace]) + "..."
+            }
+            return truncated + "..."
+        }
+        return thinking
+    }
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
@@ ... @@
+/// Returns the caption shown beneath the Thinking Mode toggle.
+private func thinkingModeDescription(for viewModel: SettingsViewModel) -> String {
+ guard viewModel.loadedModelSupportsThinking else {
+ return "Not available for the currently loaded model."
+ }
+ return viewModel.thinkingModeEnabled
+ ? "Model will use its default thinking/reasoning mode."
+ : "Thinking disabled. The model will skip its reasoning step."
+}
+
// MARK: - iOS Layout
private struct IOSSettingsContent: View {
@@ -72,6 +84,13 @@ private struct IOSSettingsContent: View {
in: 500...20000,
step: 500
)
+
+ Toggle("Thinking Mode", isOn: $viewModel.thinkingModeEnabled)
+ .disabled(!viewModel.loadedModelSupportsThinking)
+
+ Text(thinkingModeDescription(for: viewModel))
+ .font(AppTypography.caption)
+ .foregroundColor(AppColors.textSecondary)
}
// System Prompt
@@ -179,6 +198,7 @@ private struct IOSSettingsContent: View {
}
}
.navigationTitle("Settings")
+ .scrollDismissesKeyboard(.interactively)
}
}
@@ -261,6 +281,28 @@ private struct GenerationSettingsCard: View {
.frame(maxWidth: 400)
}
}
+
+ HStack {
+ Text("Thinking Mode")
+ .frame(width: 150, alignment: .leading)
+
+ Toggle("", isOn: $viewModel.thinkingModeEnabled)
+ .disabled(!viewModel.loadedModelSupportsThinking)
+
+ Spacer()
+
+ Text(viewModel.thinkingModeEnabled ? "Enabled" : "Disabled")
+ .font(AppTypography.caption)
+ .foregroundColor(
+ viewModel.thinkingModeEnabled
+ ? AppColors.primaryPurple
+ : AppColors.textSecondary
+ )
+ }
+
+ Text(thinkingModeDescription(for: viewModel))
+ .font(AppTypography.caption)
+ .foregroundColor(AppColors.textSecondary)
}
}
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
index bf14368c9..1cd536604 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
@@ -10,6 +10,7 @@ import Foundation
import SwiftUI
import RunAnywhere
import Combine
+import os
@MainActor
class SettingsViewModel: ObservableObject {
@@ -18,7 +19,9 @@ class SettingsViewModel: ObservableObject {
// Generation Settings
@Published var temperature: Double = 0.7
@Published var maxTokens: Int = 10000
- @Published var systemPrompt: String = ""
+ @Published var systemPrompt: String = "You are a helpful, concise AI assistant."
+ @Published var thinkingModeEnabled: Bool = false
+ @Published private(set) var loadedModelSupportsThinking: Bool = false
// API Configuration
@Published var apiKey: String = ""
@@ -43,6 +46,7 @@ class SettingsViewModel: ObservableObject {
// MARK: - Private Properties
+ private let logger = Logger(subsystem: "com.runanywhere.RunAnywhereAI", category: "Settings")
    private var cancellables = Set<AnyCancellable>()
private let keychainService = KeychainService.shared
private let apiKeyStorageKey = "runanywhere_api_key"
@@ -50,6 +54,7 @@ class SettingsViewModel: ObservableObject {
private let temperatureDefaultsKey = "defaultTemperature"
private let maxTokensDefaultsKey = "defaultMaxTokens"
private let systemPromptDefaultsKey = "defaultSystemPrompt"
+ private let thinkingModeKey = "thinkingModeEnabled"
private let analyticsLogKey = "analyticsLogToLocal"
private let deviceRegisteredKey = "com.runanywhere.sdk.deviceRegistered"
@@ -92,6 +97,38 @@ class SettingsViewModel: ObservableObject {
init() {
loadSettings()
setupObservers()
+ subscribeToModelNotifications()
+ }
+
+ private func subscribeToModelNotifications() {
+ // Subscribe to SDK events directly so any LLM model load
+ // (from chat, voice agent, or RAG) updates the thinking mode flag.
+ RunAnywhere.events.events
+ .receive(on: DispatchQueue.main)
+ .sink { [weak self] event in
+ Task { @MainActor in
+ self?.handleSDKEvent(event)
+ }
+ }
+ .store(in: &cancellables)
+ }
+
+ private func handleSDKEvent(_ event: any SDKEvent) {
+ guard event.category == .llm else { return }
+
+ switch event.type {
+ case "llm_model_load_completed":
+ let modelId = event.properties["model_id"] ?? ""
+ if let model = ModelListViewModel.shared.availableModels.first(where: { $0.id == modelId }) {
+ loadedModelSupportsThinking = model.supportsThinking
+ logger.info("LLM loaded (\(modelId)), supportsThinking: \(model.supportsThinking)")
+ }
+ case "llm_model_unloaded":
+ loadedModelSupportsThinking = false
+ logger.info("LLM unloaded, thinking mode disabled")
+ default:
+ break
+ }
}
// MARK: - Setup
@@ -124,6 +161,14 @@ class SettingsViewModel: ObservableObject {
}
.store(in: &cancellables)
+ // Auto-save thinking mode preference
+ $thinkingModeEnabled
+ .dropFirst()
+ .sink { [weak self] newValue in
+ self?.saveThinkingModePreference(newValue)
+ }
+ .store(in: &cancellables)
+
// Auto-save analytics logging preference
$analyticsLogToLocal
.dropFirst() // Skip initial value to avoid saving on init
@@ -143,16 +188,23 @@ class SettingsViewModel: ObservableObject {
}
private func loadGenerationSettings() {
- // Load temperature
- let savedTemperature = UserDefaults.standard.double(forKey: temperatureDefaultsKey)
- temperature = savedTemperature > 0 ? savedTemperature : 0.7
+ // Load temperature — use object(forKey:) to distinguish unset (nil) from explicit 0.0
+ let savedTemperature = UserDefaults.standard.object(forKey: temperatureDefaultsKey) as? Double
+ temperature = savedTemperature ?? 0.7
// Load max tokens
let savedMaxTokens = UserDefaults.standard.integer(forKey: maxTokensDefaultsKey)
maxTokens = savedMaxTokens > 0 ? savedMaxTokens : 10000
- // Load system prompt
- systemPrompt = UserDefaults.standard.string(forKey: systemPromptDefaultsKey) ?? ""
+ // Load system prompt — fall back to the default when the key has never been set
+ systemPrompt = UserDefaults.standard.string(forKey: systemPromptDefaultsKey) ?? "You are a helpful, concise AI assistant."
+ // Persist the default so that other ViewModels reading UserDefaults directly always find a value
+ if UserDefaults.standard.string(forKey: systemPromptDefaultsKey) == nil {
+ UserDefaults.standard.set(systemPrompt, forKey: systemPromptDefaultsKey)
+ }
+
+ // Load thinking mode
+ thinkingModeEnabled = UserDefaults.standard.bool(forKey: thinkingModeKey)
}
private func loadApiKeyConfiguration() {
@@ -200,12 +252,18 @@ class SettingsViewModel: ObservableObject {
print("Settings: Saved system prompt (\(value.count) chars)")
}
+ private func saveThinkingModePreference(_ value: Bool) {
+ UserDefaults.standard.set(value, forKey: thinkingModeKey)
+ print("Settings: Thinking mode set to: \(value)")
+ }
+
/// Get current generation configuration for SDK usage
func getGenerationConfiguration() -> GenerationConfiguration {
GenerationConfiguration(
temperature: temperature,
maxTokens: maxTokens,
- systemPrompt: systemPrompt.isEmpty ? nil : systemPrompt
+ systemPrompt: systemPrompt.isEmpty ? nil : systemPrompt,
+ thinkingModeEnabled: thinkingModeEnabled
)
}
@@ -418,4 +476,5 @@ struct GenerationConfiguration {
let temperature: Double
let maxTokens: Int
let systemPrompt: String?
+ let thinkingModeEnabled: Bool
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
index dec41a8ef..965704521 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
@@ -84,7 +84,23 @@ class ToolSettingsViewModel: ObservableObject {
category: "Utility"
),
executor: { args in
- let expression = args["expression"]?.stringValue ?? args["input"]?.stringValue ?? "0"
+ // Extract expression from args, handling both string and number ToolValue types
+ let expression: String = {
+ let keys = ["expression", "input", "expr"]
+ for key in keys {
+ if let val = args[key] {
+ if let s = val.stringValue { return s }
+ if let n = val.numberValue { return "\(n)" }
+ }
+ }
+ // Fallback: try any value in the dict
+ for val in args.values {
+ if let s = val.stringValue { return s }
+ if let n = val.numberValue { return "\(n)" }
+ }
+ return "0"
+ }()
+ print("Calculator received args: \(args), using expression: '\(expression)'")
// Clean the expression - remove any non-math characters
let cleanedExpression = expression
.replacingOccurrences(of: "=", with: "")
@@ -93,16 +109,22 @@ class ToolSettingsViewModel: ObservableObject {
.replacingOccurrences(of: "÷", with: "/")
.trimmingCharacters(in: .whitespacesAndNewlines)
- do {
- let exp = NSExpression(format: cleanedExpression)
- if let result = exp.expressionValue(with: nil, context: nil) as? NSNumber {
- return [
- "result": .number(result.doubleValue),
- "expression": .string(expression)
- ]
- }
- } catch {
- // Fall through to error
+ // Validate expression contains only safe math characters
+ let allowedChars = CharacterSet(charactersIn: "0123456789.+-*/() ")
+ guard cleanedExpression.unicodeScalars.allSatisfy({ allowedChars.contains($0) }),
+ !cleanedExpression.isEmpty else {
+ return [
+ "error": .string("Could not evaluate expression: \(expression)"),
+ "expression": .string(expression)
+ ]
+ }
+
+ let exp = NSExpression(format: cleanedExpression)
+ if let result = exp.expressionValue(with: nil, context: nil) as? NSNumber {
+ return [
+ "result": .number(result.doubleValue),
+ "expression": .string(expression)
+ ]
}
return [
"error": .string("Could not evaluate expression: \(expression)"),
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
index 80f0ab56a..6f7ad685d 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
@@ -17,6 +17,7 @@ struct VLMCameraView: View {
@State private var showingModelSelection = false
@State private var showingPhotos = false
@State private var selectedPhoto: PhotosPickerItem?
+ @Environment(\.scenePhase) private var scenePhase
var body: some View {
ZStack {
@@ -52,6 +53,14 @@ struct VLMCameraView: View {
viewModel.stopAutoStreaming()
viewModel.stopCamera()
}
+ .onChange(of: scenePhase) { _, newPhase in
+ if newPhase == .background || newPhase == .inactive {
+ viewModel.stopAutoStreaming()
+ viewModel.stopCamera()
+ } else if newPhase == .active {
+ setupCameraIfNeeded()
+ }
+ }
}
// MARK: - Main Content
@@ -287,8 +296,10 @@ struct VLMCameraView: View {
private func setupCameraIfNeeded() {
Task {
await viewModel.checkCameraAuthorization()
- if viewModel.isCameraAuthorized && viewModel.captureSession == nil {
- viewModel.setupCamera()
+ if viewModel.isCameraAuthorized {
+ if viewModel.captureSession == nil {
+ viewModel.setupCamera()
+ }
viewModel.startCamera()
}
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
index ce876682c..3507fa840 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
@@ -150,13 +150,15 @@ final class VoiceAgentViewModel: ObservableObject {
var instructionText: String {
switch sessionState {
case .listening:
- return "Listening... Pause to send"
+ return "Tap to send · Hold to stop"
case .processing:
return "Processing your message..."
case .speaking:
return "Speaking..."
case .connecting:
return "Connecting..."
+ case .connected:
+ return "Tap to speak · Hold to end"
default:
return "Tap to start conversation"
}
@@ -387,7 +389,13 @@ final class VoiceAgentViewModel: ObservableObject {
assistantResponse = ""
do {
- session = try await RunAnywhere.startVoiceSession()
+ let settings = SettingsViewModel.shared
+ let voiceConfig = VoiceSessionConfig(
+ continuousMode: false,
+ thinkingModeEnabled: settings.loadedModelSupportsThinking && settings.thinkingModeEnabled,
+ maxTokens: settings.maxTokens
+ )
+ session = try await RunAnywhere.startVoiceSession(config: voiceConfig)
sessionState = .listening
currentStatus = "Listening..."
eventTask = Task { [weak self] in
@@ -419,12 +427,24 @@ final class VoiceAgentViewModel: ObservableObject {
logger.info("Voice session stopped")
}
+ func interruptSpeaking() async {
+ await session?.interruptPlayback()
+ }
+
/// Force send current audio buffer (for push-to-talk mode)
func sendAudioNow() async {
await session?.sendNow()
logger.debug("Forced audio send")
}
+ /// Resume listening on the current session (push-to-talk: user taps mic after turn completes)
+ func resumeListening() async {
+ await session?.resumeListening()
+ sessionState = .listening
+ currentStatus = "Listening..."
+ logger.debug("Resumed listening")
+ }
+
// MARK: - Session Event Handling
private func handleSessionEvent(_ event: VoiceSessionEvent) {
@@ -434,11 +454,11 @@ final class VoiceAgentViewModel: ObservableObject {
case .speechStarted: isSpeechDetected = true; currentStatus = "Listening..."
case .processing: sessionState = .processing; currentStatus = "Processing..."; isSpeechDetected = false
case .transcribed(let text): currentTranscript = text
- case .responded(let text): assistantResponse = text
+ case .responded(let text, _): assistantResponse = text
case .speaking: sessionState = .speaking; currentStatus = "Speaking..."
- case let .turnCompleted(transcript, response, _):
+ case let .turnCompleted(transcript, response, _, _):
currentTranscript = transcript; assistantResponse = response
- sessionState = .listening; currentStatus = "Listening..."
+ sessionState = .connected; currentStatus = "Ready"
case .stopped: sessionState = .disconnected; currentStatus = "Ready"
case .error(let message): logger.error("Session error: \(message)"); errorMessage = message
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
index 099a111ea..a566da458 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
@@ -418,16 +418,28 @@ extension VoiceAssistantView {
isLoading: isLoading,
activeColor: viewModel.micButtonColor.swiftUIColor,
inactiveColor: viewModel.micButtonColor.swiftUIColor,
- icon: viewModel.micButtonIcon
- ) {
- Task {
- if viewModel.isActive {
- await viewModel.stopConversation()
- } else {
- await viewModel.startConversation()
+ icon: viewModel.micButtonIcon,
+ action: {
+ Task {
+ if viewModel.isSpeaking {
+ await viewModel.interruptSpeaking()
+ } else if viewModel.isListening {
+ await viewModel.sendAudioNow()
+ } else if viewModel.sessionState == .connected {
+ await viewModel.resumeListening()
+ } else if !viewModel.isActive {
+ await viewModel.startConversation()
+ }
+ }
+ },
+ onLongPress: {
+ Task {
+ if viewModel.isActive || viewModel.sessionState == .connected {
+ await viewModel.stopConversation()
+ }
}
}
- }
+ )
Spacer()
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
index ecf4670b0..c42d996a6 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
@@ -449,6 +449,7 @@ struct AdaptiveMicButton: View {
let inactiveColor: Color
let icon: String
let action: () -> Void
+ let onLongPress: (() -> Void)?
init(
isActive: Bool = false,
@@ -457,7 +458,8 @@ struct AdaptiveMicButton: View {
activeColor: Color = .red,
inactiveColor: Color = AppColors.primaryAccent,
icon: String = "mic.fill",
- action: @escaping () -> Void
+ action: @escaping () -> Void,
+ onLongPress: (() -> Void)? = nil
) {
self.isActive = isActive
self.isPulsing = isPulsing
@@ -466,83 +468,55 @@ struct AdaptiveMicButton: View {
self.inactiveColor = inactiveColor
self.icon = icon
self.action = action
+ self.onLongPress = onLongPress
+ }
+
+ private var micContent: some View {
+ ZStack {
+ // Background circle
+ Circle()
+ .fill(isActive ? activeColor : inactiveColor)
+ .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
+
+ // Pulsing effect when active
+ if isPulsing {
+ Circle()
+ .stroke(Color.white.opacity(0.4), lineWidth: 2)
+ .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
+ .scaleEffect(1.3)
+ .opacity(0)
+ .animation(
+ .easeOut(duration: 1.0).repeatForever(autoreverses: false),
+ value: isPulsing
+ )
+ }
+
+ // Icon or loading indicator
+ if isLoading {
+ ProgressView()
+ .progressViewStyle(CircularProgressViewStyle(tint: .white))
+ .scaleEffect(1.2)
+ } else {
+ Image(systemName: icon)
+ .font(.system(size: AdaptiveSizing.micIconSize))
+ .foregroundColor(.white)
+ .contentTransition(.symbolEffect(.replace))
+ .animation(.smooth(duration: 0.3), value: icon)
+ }
+ }
}
var body: some View {
Group {
if #available(iOS 26.0, macOS 26.0, *) {
- Button(action: action) {
- ZStack {
- // Background circle
- Circle()
- .fill(isActive ? activeColor : inactiveColor)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
-
- // Pulsing effect when active
- if isPulsing {
- Circle()
- .stroke(Color.white.opacity(0.4), lineWidth: 2)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
- .scaleEffect(1.3)
- .opacity(0)
- .animation(
- .easeOut(duration: 1.0).repeatForever(autoreverses: false),
- value: isPulsing
- )
- }
-
- // Icon or loading indicator
- if isLoading {
- ProgressView()
- .progressViewStyle(CircularProgressViewStyle(tint: .white))
- .scaleEffect(1.2)
- } else {
- Image(systemName: icon)
- .font(.system(size: AdaptiveSizing.micIconSize))
- .foregroundColor(.white)
- .contentTransition(.symbolEffect(.replace))
- .animation(.smooth(duration: 0.3), value: icon)
- }
- }
- }
- .buttonStyle(.plain)
- .glassEffect(.regular.interactive())
+ micContent
+ .onLongPressGesture(minimumDuration: 0.5, perform: { onLongPress?() ?? action() })
+ .onTapGesture(perform: action)
+ .glassEffect(.regular.interactive())
} else {
- Button(action: action) {
- ZStack {
- // Background circle
- Circle()
- .fill(isActive ? activeColor : inactiveColor)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
-
- // Pulsing effect when active
- if isPulsing {
- Circle()
- .stroke(Color.white.opacity(0.4), lineWidth: 2)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
- .scaleEffect(1.3)
- .opacity(0)
- .animation(
- .easeOut(duration: 1.0).repeatForever(autoreverses: false),
- value: isPulsing
- )
- }
-
- // Icon or loading indicator
- if isLoading {
- ProgressView()
- .progressViewStyle(CircularProgressViewStyle(tint: .white))
- .scaleEffect(1.2)
- } else {
- Image(systemName: icon)
- .font(.system(size: AdaptiveSizing.micIconSize))
- .foregroundColor(.white)
- .contentTransition(.symbolEffect(.replace))
- .animation(.smooth(duration: 0.3), value: icon)
- }
- }
- }
- .buttonStyle(.plain)
+ micContent
+ .onLongPressGesture(minimumDuration: 0.5, perform: { onLongPress?() ?? action() })
+ .onTapGesture(perform: action)
}
}
}
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index 76218a7ae..fe1f57524 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -679,6 +679,8 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
std::string partial_utf8_buffer;
partial_utf8_buffer.reserve(8);
+ Utf8State scanner_state;
+
int n_cur = batch.n_tokens;
int tokens_generated = 0;
bool stop_sequence_hit = false;
@@ -696,11 +698,11 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
const std::string new_token_chars =
common_token_to_piece(context_, new_token_id);
+ const size_t old_partial_size = partial_utf8_buffer.size();
partial_utf8_buffer.append(new_token_chars);
- Utf8State scanner_state;
size_t valid_upto = 0;
- for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
+ for (size_t i = old_partial_size; i < partial_utf8_buffer.size(); ++i) {
scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
if (scanner_state.state == 0) {
valid_upto = i + 1;
@@ -735,12 +737,17 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
if (stop_window.size() > MAX_STOP_LEN) {
size_t safe_len = stop_window.size() - MAX_STOP_LEN;
- if (!callback(stop_window.substr(0, safe_len))) {
- LOGI("Generation cancelled by callback");
- cancel_requested_.store(true);
- break;
+ while (safe_len > 0 && (stop_window[safe_len] & 0xC0) == 0x80) {
+ safe_len--;
+ }
+ if (safe_len > 0) {
+ if (!callback(stop_window.substr(0, safe_len))) {
+ LOGI("Generation cancelled by callback");
+ cancel_requested_.store(true);
+ break;
+ }
+ stop_window.erase(0, safe_len);
}
- stop_window.erase(0, safe_len);
}
}
@@ -973,6 +980,8 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
std::string partial_utf8_buffer;
partial_utf8_buffer.reserve(8);
+ Utf8State scanner_state;
+
std::string generated_text;
int n_cur = static_cast<int>(current_pos) + n_prompt;
int tokens_generated = 0;
@@ -987,38 +996,17 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
}
const std::string new_token_chars = common_token_to_piece(context_, new_token_id);
+ const size_t old_partial_size = partial_utf8_buffer.size();
partial_utf8_buffer.append(new_token_chars);
- struct Utf8Check {
- static size_t valid_upto(const std::string& buf) {
- static const uint8_t utf8d[] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3,
- 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,
- 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,
- 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,
- 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1,
- 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- };
- uint32_t state = 0;
- size_t upto = 0;
- for (size_t i = 0; i < buf.size(); ++i) {
- uint32_t type = utf8d[static_cast<uint8_t>(buf[i])];
- state = utf8d[256 + state * 16 + type];
- if (state == 0) upto = i + 1;
- }
- return upto;
+ size_t valid_upto = 0;
+ for (size_t i = old_partial_size; i < partial_utf8_buffer.size(); ++i) {
+ scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
+ if (scanner_state.state == 0) {
+ valid_upto = i + 1;
}
- };
+ }
- const size_t valid_upto = Utf8Check::valid_upto(partial_utf8_buffer);
if (valid_upto > 0) {
std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
stop_window.append(valid_chunk);
@@ -1042,9 +1030,14 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
}
if (stop_window.size() > MAX_STOP_LEN) {
- const size_t safe_len = stop_window.size() - MAX_STOP_LEN;
- generated_text += stop_window.substr(0, safe_len);
- stop_window.erase(0, safe_len);
+ size_t safe_len = stop_window.size() - MAX_STOP_LEN;
+ while (safe_len > 0 && (stop_window[safe_len] & 0xC0) == 0x80) {
+ safe_len--;
+ }
+ if (safe_len > 0) {
+ generated_text += stop_window.substr(0, safe_len);
+ stop_window.erase(0, safe_len);
+ }
}
}
diff --git a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
index d09f739ea..d13846b7f 100644
--- a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
@@ -9,6 +9,7 @@
* Do NOT add features not present in the Swift code.
*/
+#include <atomic>
#include
#include
#include
@@ -45,6 +46,9 @@ struct rac_llm_component {
/** Mutex for thread safety */
std::mutex mtx;
+ /** Cancellation flag - set by cancel(), read by token callback without holding mtx */
+ std::atomic<bool> cancel_requested{false};
+
/** Resolved inference framework (defaults to LlamaCPP, the primary LLM backend) */
rac_inference_framework_t actual_framework;
@@ -509,6 +513,8 @@ struct llm_stream_context {
float temperature;
int32_t max_tokens;
int32_t token_count; // Track tokens for streaming updates
+
+ std::atomic<bool>* cancel_flag;
};
/**
@@ -517,6 +523,10 @@ struct llm_stream_context {
static rac_bool_t llm_stream_token_callback(const char* token, void* user_data) {
auto* ctx = reinterpret_cast<llm_stream_context*>(user_data);
+ if (ctx->cancel_flag && ctx->cancel_flag->load(std::memory_order_relaxed)) {
+ return RAC_FALSE;
+ }
+
// Track first token time and emit first token event
if (!ctx->first_token_recorded) {
ctx->first_token_recorded = true;
@@ -576,6 +586,8 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
auto* component = reinterpret_cast<rac_llm_component*>(handle);
std::lock_guard lock(component->mtx);
+ component->cancel_requested.store(false, std::memory_order_relaxed);
+
// Generate unique ID for this generation
std::string generation_id = generate_unique_id();
const char* model_id = rac_lifecycle_get_model_id(component->lifecycle);
@@ -667,6 +679,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
ctx.temperature = effective_options->temperature;
ctx.max_tokens = effective_options->max_tokens;
ctx.token_count = 0;
+ ctx.cancel_flag = &component->cancel_requested;
// Perform streaming generation
result = rac_llm_generate_stream(service, prompt, effective_options, llm_stream_token_callback,
@@ -702,7 +715,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
rac_llm_result_t final_result = {};
final_result.text = strdup(ctx.full_text.c_str());
final_result.prompt_tokens = ctx.prompt_tokens;
- final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
+ final_result.completion_tokens = ctx.token_count > 0 ? ctx.token_count : estimate_tokens(ctx.full_text.c_str());
final_result.total_tokens = final_result.prompt_tokens + final_result.completion_tokens;
final_result.total_time_ms = total_time_ms;
@@ -761,7 +774,8 @@ extern "C" rac_result_t rac_llm_component_cancel(rac_handle_t handle) {
return RAC_ERROR_INVALID_HANDLE;
auto* component = reinterpret_cast<rac_llm_component*>(handle);
- std::lock_guard lock(component->mtx);
+
+ component->cancel_requested.store(true, std::memory_order_relaxed);
rac_handle_t service = rac_lifecycle_get_service(component->lifecycle);
if (service) {
diff --git a/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp b/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
index 4212d4abb..0ca01a419 100644
--- a/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
@@ -398,13 +398,41 @@ static bool extract_json_value(const char* json_obj, const char* key, char** out
*out_is_object = true;
return true;
}
+ } else {
+ // Scalar value (number, boolean, null)
+ // Read until comma, closing brace, or whitespace
+ size_t val_start = pos;
+ size_t val_end = pos;
+ while (val_end < len && json_obj[val_end] != ',' &&
+ json_obj[val_end] != '}' && json_obj[val_end] != ']' &&
+ json_obj[val_end] != '\n') {
+ val_end++;
+ }
+ // Trim trailing whitespace
+ while (val_end > val_start &&
+ (json_obj[val_end - 1] == ' ' || json_obj[val_end - 1] == '\t')) {
+ val_end--;
+ }
+ if (val_end > val_start) {
+ size_t val_len = val_end - val_start;
+ *out_value = static_cast<char*>(malloc(val_len + 1));
+ if (*out_value) {
+ memcpy(*out_value, json_obj + val_start, val_len);
+ (*out_value)[val_len] = '\0';
+ }
+ *out_is_object = false;
+ return true;
+ }
}
}
}
}
// Move to end of key for continued scanning
+ // Skip the in_string toggle - extract_json_string already
+ // consumed the closing quote so in_string must stay false.
i = key_end - 1;
+ continue;
}
}
in_string = !in_string;
@@ -663,10 +691,46 @@ static bool extract_tool_name_and_args(const char* json_obj, char** out_tool_nam
}
}
- // No arguments found - use empty object
- *out_args_json = static_cast<char*>(malloc(3));
- if (*out_args_json) {
- std::memcpy(*out_args_json, "{}", 3);
+ // No standard argument wrapper key found.
+ // Fallback: collect all remaining keys (excluding the tool name key)
+ // as flat arguments. This handles LLM output like:
+ // {"tool": "calculate", "expression": "5 * 100"}
+ {
+ std::vector<std::string> all_keys = get_json_keys(json_obj);
+ std::string flat_args = "{";
+ bool first = true;
+ for (const auto& k : all_keys) {
+ // Skip the key that matched the tool name
+ bool is_tool_key = false;
+ for (int t = 0; TOOL_NAME_KEYS[t] != nullptr; t++) {
+ if (str_equals_ignore_case(k.c_str(), TOOL_NAME_KEYS[t])) {
+ is_tool_key = true;
+ break;
+ }
+ }
+ if (is_tool_key) continue;
+
+ char* kval = nullptr;
+ bool kval_is_obj = false;
+ if (extract_json_value(json_obj, k.c_str(), &kval, &kval_is_obj)) {
+ if (!first) flat_args += ",";
+ std::string escaped_key = escape_json_string(k.c_str());
+ if (kval_is_obj) {
+ flat_args += "\"" + escaped_key + "\":" + std::string(kval);
+ } else if (kval) {
+ std::string escaped_val = escape_json_string(kval);
+ flat_args += "\"" + escaped_key + "\":\"" + escaped_val + "\"";
+ }
+ free(kval);
+ first = false;
+ }
+ }
+ flat_args += "}";
+
+ *out_args_json = static_cast<char*>(malloc(flat_args.size() + 1));
+ if (*out_args_json) {
+ std::memcpy(*out_args_json, flat_args.c_str(), flat_args.size() + 1);
+ }
}
return true;
}
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
index 09baad44d..b6b3413a8 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
@@ -87,7 +87,7 @@ public class AudioPlaybackManager: NSObject, ObservableObject, AVAudioPlayerDele
/// Stop current playback
public func stop() {
- guard isPlaying else { return }
+ guard audioPlayer != nil else { return }
audioPlayer?.stop()
cleanupPlayback(success: false)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
index 0542583be..823c513b8 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
@@ -179,6 +179,15 @@ public enum CppBridge {
guard wasInitialized else { return }
+ Task {
+ await LLM.shared.destroy()
+ await STT.shared.destroy()
+ await TTS.shared.destroy()
+ await VAD.shared.destroy()
+ await VoiceAgent.shared.destroy()
+ await VLM.shared.destroy()
+ }
+
// Shutdown in reverse order
// Note: ModelAssignment and Platform callbacks remain valid (static)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
index 3123541de..ec988e980 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
@@ -190,7 +190,9 @@ public final class ArchiveUtility {
/// Decompress raw deflate data using streaming compression_stream_process.
/// Uses a small 256 KB output buffer instead of pre-allocating compressedSize * N.
private static func decompressDeflateStreaming(_ data: Data, range: Range<Int>) throws -> Data {
- var stream = compression_stream()
+ let placeholder = UnsafeMutablePointer<UInt8>.allocate(capacity: 1)
+ defer { placeholder.deallocate() }
+ var stream = compression_stream(dst_ptr: placeholder, dst_size: 0, src_ptr: placeholder, src_size: 0, state: nil)
guard compression_stream_init(&stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB) == COMPRESSION_STATUS_OK else {
throw SDKError.download(.extractionFailed, "Failed to initialize decompression stream")
}
@@ -218,7 +220,7 @@ public final class ArchiveUtility {
stream.dst_ptr = outputBuffer
stream.dst_size = outputChunkSize
- status = compression_stream_process(&stream, COMPRESSION_STREAM_FINALIZE)
+ status = compression_stream_process(&stream, Int32(COMPRESSION_STREAM_FINALIZE.rawValue))
let bytesProduced = outputChunkSize - stream.dst_size
if bytesProduced > 0 {
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
index bc721a59c..ba4091e80 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
@@ -85,19 +85,21 @@ public extension RunAnywhere {
let totalTimeMs = endTime.timeIntervalSince(startTime) * 1000
// Extract result
- let generatedText: String
+ let rawText: String
if let textPtr = llmResult.text {
- generatedText = String(cString: textPtr)
+ rawText = String(cString: textPtr)
} else {
- generatedText = ""
+ rawText = ""
}
let inputTokens = Int(llmResult.prompt_tokens)
let outputTokens = Int(llmResult.completion_tokens)
let tokensPerSecond = llmResult.tokens_per_second > 0 ? Double(llmResult.tokens_per_second) : 0
+ let (generatedText, thinkingContent) = ThinkingContentParser.extract(from: rawText)
+
return LLMGenerationResult(
text: generatedText,
- thinkingContent: nil,
+ thinkingContent: thinkingContent,
inputTokens: inputTokens,
tokensUsed: outputTokens,
modelUsed: modelId,
@@ -105,7 +107,7 @@ public extension RunAnywhere {
framework: "llamacpp",
tokensPerSecond: tokensPerSecond,
timeToFirstTokenMs: nil,
- thinkingTokens: 0,
+ thinkingTokens: thinkingContent.map { _ in outputTokens } ?? 0,
responseTokens: outputTokens
)
}
@@ -189,47 +191,43 @@ public extension RunAnywhere {
) -> AsyncThrowingStream<String, Error> {
AsyncThrowingStream { continuation in
Task {
- do {
- await collector.markStart()
-
- let context = LLMStreamCallbackContext(continuation: continuation, collector: collector)
- let contextPtr = Unmanaged.passRetained(context).toOpaque()
-
- let callbacks = LLMStreamCallbacks.create()
- var cOptions = options
-
- let callCFunction: () -> rac_result_t = {
- prompt.withCString { promptPtr in
- rac_llm_component_generate_stream(
- handle,
- promptPtr,
- &cOptions,
- callbacks.token,
- callbacks.complete,
- callbacks.error,
- contextPtr
- )
- }
+ await collector.markStart()
+
+ let context = LLMStreamCallbackContext(continuation: continuation, collector: collector)
+ // passRetained: context is released in completeCallback or errorCallback
+ let contextPtr = Unmanaged.passRetained(context).toOpaque()
+
+ let callbacks = LLMStreamCallbacks.create()
+ var cOptions = options
+
+ let callCFunction: () -> rac_result_t = {
+ prompt.withCString { promptPtr in
+ rac_llm_component_generate_stream(
+ handle,
+ promptPtr,
+ &cOptions,
+ callbacks.token,
+ callbacks.complete,
+ callbacks.error,
+ contextPtr
+ )
}
+ }
- let streamResult: rac_result_t
- if let systemPrompt = systemPrompt {
- streamResult = systemPrompt.withCString { sysPtr in
- cOptions.system_prompt = sysPtr
- return callCFunction()
- }
- } else {
- cOptions.system_prompt = nil
- streamResult = callCFunction()
+ let streamResult: rac_result_t
+ if let systemPrompt = systemPrompt {
+ streamResult = systemPrompt.withCString { sysPtr in
+ cOptions.system_prompt = sysPtr
+ return callCFunction()
}
+ } else {
+ cOptions.system_prompt = nil
+ streamResult = callCFunction()
+ }
- if streamResult != RAC_SUCCESS {
- Unmanaged<LLMStreamCallbackContext>.fromOpaque(contextPtr).release()
- let error = SDKError.llm(.generationFailed, "Stream generation failed: \(streamResult)")
- continuation.finish(throwing: error)
- await collector.markFailed(error)
- }
- } catch {
+ if streamResult != RAC_SUCCESS {
+ Unmanaged<LLMStreamCallbackContext>.fromOpaque(contextPtr).release()
+ let error = SDKError.llm(.generationFailed, "Stream generation failed: \(streamResult)")
continuation.finish(throwing: error)
await collector.markFailed(error)
}
@@ -255,6 +253,7 @@ private enum LLMStreamCallbacks {
static func create() -> Callbacks {
let tokenCallback: TokenFn = { tokenPtr, userData -> rac_bool_t in
guard let tokenPtr = tokenPtr, let userData = userData else { return RAC_TRUE }
+ if Task.isCancelled { return RAC_FALSE }
let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
let token = String(cString: tokenPtr)
Task {
@@ -264,16 +263,28 @@ private enum LLMStreamCallbacks {
return RAC_TRUE
}
- let completeCallback: CompleteFn = { _, userData in
+ let completeCallback: CompleteFn = { resultPtr, userData in
guard let userData = userData else { return }
- let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
+ let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeRetainedValue()
ctx.continuation.finish()
- Task { await ctx.collector.markComplete() }
+
+ if let result = resultPtr?.pointee {
+ Task {
+ await ctx.collector.markCompleteWithMetrics(
+ promptTokens: Int(result.prompt_tokens),
+ completionTokens: Int(result.completion_tokens),
+ tokensPerSecond: Double(result.tokens_per_second),
+ timeToFirstTokenMs: Double(result.time_to_first_token_ms)
+ )
+ }
+ } else {
+ Task { await ctx.collector.markComplete() }
+ }
}
let errorCallback: ErrorFn = { _, errorMsg, userData in
guard let userData = userData else { return }
- let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
+ let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeRetainedValue()
let message = errorMsg.map { String(cString: $0) } ?? "Unknown error"
let error = SDKError.llm(.generationFailed, message)
ctx.continuation.finish(throwing: error)
@@ -296,6 +307,34 @@ private final class LLMStreamCallbackContext: @unchecked Sendable {
}
}
+// MARK: - Thinking Content Parser
+
+enum ThinkingContentParser {
+ /// Extracts `<think>...</think>` content from generated text.
+ /// - Returns: Tuple of (responseText, thinkingContent). If no tags found, responseText = original text, thinkingContent = nil.
+ static func extract(from text: String) -> (text: String, thinking: String?) {
+ guard let startRange = text.range(of: "<think>"),
+ let endRange = text.range(of: "</think>"),
+ startRange.upperBound <= endRange.lowerBound else {
+ return (text: text, thinking: nil)
+ }
+ let thinkingContent = String(text[startRange.upperBound..<endRange.lowerBound])
+ .trimmingCharacters(in: .whitespacesAndNewlines)
+ // Response text is everything before <think> and after </think>
+ let textBefore = String(text[..<startRange.lowerBound])
+ let textAfter = String(text[endRange.upperBound...])
+ let responseText = (textBefore + textAfter).trimmingCharacters(in: .whitespacesAndNewlines)
+ return (text: responseText, thinking: thinkingContent.isEmpty ? nil : thinkingContent)
+ }
+}
+
+ private var cppPromptTokens: Int?
+ private var cppCompletionTokens: Int?
+ private var cppTokensPerSecond: Double?
+ private var cppTimeToFirstTokenMs: Double?
+
init(modelId: String, promptLength: Int) {
self.modelId = modelId
self.promptLength = promptLength
@@ -339,6 +383,24 @@ private actor LLMStreamingMetricsCollector {
}
}
+ func markCompleteWithMetrics(
+ promptTokens: Int,
+ completionTokens: Int,
+ tokensPerSecond: Double,
+ timeToFirstTokenMs: Double
+ ) {
+ if promptTokens > 0 { cppPromptTokens = promptTokens }
+ if completionTokens > 0 { cppCompletionTokens = completionTokens }
+ if tokensPerSecond > 0 { cppTokensPerSecond = tokensPerSecond }
+ if timeToFirstTokenMs > 0 { cppTimeToFirstTokenMs = timeToFirstTokenMs }
+
+ isComplete = true
+ if let continuation = resultContinuation {
+ continuation.resume(returning: buildResult())
+ resultContinuation = nil
+ }
+ }
+
func markFailed(_ error: Error) {
self.error = error
if let continuation = resultContinuation {
@@ -363,20 +425,32 @@ private actor LLMStreamingMetricsCollector {
let endTime = Date()
let latencyMs = (startTime.map { endTime.timeIntervalSince($0) } ?? 0) * 1000
- var timeToFirstTokenMs: Double?
- if let start = startTime, let firstToken = firstTokenTime {
+ let timeToFirstTokenMs: Double?
+ if let cppTtft = cppTimeToFirstTokenMs {
+ timeToFirstTokenMs = cppTtft
+ } else if let start = startTime, let firstToken = firstTokenTime {
timeToFirstTokenMs = firstToken.timeIntervalSince(start) * 1000
+ } else {
+ timeToFirstTokenMs = nil
}
- // Use actual token count from streaming callbacks, not character estimation (fixes #339)
- let outputTokens = max(1, tokenCount)
- let totalTimeSec = latencyMs / 1000.0
- let tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
+ let outputTokens = cppCompletionTokens ?? max(1, tokenCount)
+ let inputTokens = cppPromptTokens ?? 0
+
+ let tokensPerSecond: Double
+ if let cppTps = cppTokensPerSecond {
+ tokensPerSecond = cppTps
+ } else {
+ let totalTimeSec = latencyMs / 1000.0
+ tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
+ }
+
+ let (responseText, thinkingContent) = ThinkingContentParser.extract(from: fullText)
return LLMGenerationResult(
- text: fullText,
- thinkingContent: nil,
- inputTokens: 0,
+ text: responseText,
+ thinkingContent: thinkingContent,
+ inputTokens: inputTokens,
tokensUsed: outputTokens,
modelUsed: modelId,
latencyMs: latencyMs,
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
index 2a8481f7c..eef87c382 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
@@ -168,8 +168,17 @@ public extension RunAnywhere {
let registeredTools = await ToolRegistry.shared.getAll()
let tools = opts.tools ?? registeredTools
+ // Extract /no_think prefix before building the full prompt so it stays
+ // at the beginning where the C++ inference layer expects it.
+ let noThinkPrefix = "/no_think\n"
+ let hasNoThink = prompt.hasPrefix(noThinkPrefix)
+ let cleanPrompt = hasNoThink ? String(prompt.dropFirst(noThinkPrefix.count)) : prompt
+
let systemPrompt = buildToolSystemPrompt(tools: tools, options: opts)
- var fullPrompt = systemPrompt.isEmpty ? prompt : "\(systemPrompt)\n\nUser: \(prompt)"
+ var fullPrompt = systemPrompt.isEmpty ? cleanPrompt : "\(systemPrompt)\n\nUser: \(cleanPrompt)"
+ if hasNoThink {
+ fullPrompt = "\(noThinkPrefix)\(fullPrompt)"
+ }
var allToolCalls: [ToolCall] = []
var allToolResults: [ToolResult] = []
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
index cda94d5c6..7d3dec55c 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
@@ -95,6 +95,7 @@ public extension RunAnywhere {
// Synthesize (C++ emits events)
var ttsResult = rac_tts_result_t()
+ defer { rac_tts_result_free(&ttsResult) }
let synthesizeResult = text.withCString { textPtr in
rac_tts_component_synthesize(handle, textPtr, &cOptions, &ttsResult)
}
@@ -157,7 +158,6 @@ public extension RunAnywhere {
let voiceId = await CppBridge.TTS.shared.currentVoiceId ?? "unknown"
let startTime = Date()
- var totalAudioData = Data()
// Build C options
var cOptions = rac_tts_options_t()
@@ -166,8 +166,8 @@ public extension RunAnywhere {
cOptions.volume = options.volume
cOptions.sample_rate = Int32(options.sampleRate)
- // Create callback context
- let context = TTSStreamContext(onChunk: onAudioChunk, totalData: &totalAudioData)
+ // Create callback context - owns its own Data
+ let context = TTSStreamContext(onChunk: onAudioChunk)
let contextPtr = Unmanaged.passRetained(context).toOpaque()
let streamResult = text.withCString { textPtr in
@@ -180,13 +180,14 @@ public extension RunAnywhere {
let ctx = Unmanaged<TTSStreamContext>.fromOpaque(userData).takeUnretainedValue()
let chunk = Data(bytes: audioPtr, count: audioSize)
ctx.onChunk(chunk)
- ctx.totalData.pointee.append(chunk)
+ ctx.totalData.append(chunk)
},
contextPtr
)
}
- Unmanaged<TTSStreamContext>.fromOpaque(contextPtr).release()
+ let finalContext = Unmanaged<TTSStreamContext>.fromOpaque(contextPtr).takeRetainedValue()
+ let totalAudioData = finalContext.totalData
guard streamResult == RAC_SUCCESS else {
throw SDKError.tts(.processingFailed, "Streaming synthesis failed: \(streamResult)")
@@ -309,10 +310,9 @@ public extension RunAnywhere {
private final class TTSStreamContext: @unchecked Sendable {
let onChunk: (Data) -> Void
- var totalData: UnsafeMutablePointer<Data>
+ var totalData: Data = Data()
- init(onChunk: @escaping (Data) -> Void, totalData: UnsafeMutablePointer<Data>) {
+ init(onChunk: @escaping (Data) -> Void) {
self.onChunk = onChunk
- self.totalData = totalData
}
}
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
index f96978079..d61fb8cbb 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
@@ -257,10 +257,14 @@ public extension RunAnywhere {
rac_voice_agent_synthesize_speech(handle, textPtr, &audioPtr, &audioSize)
}
- guard result == RAC_SUCCESS, let ptr = audioPtr, audioSize > 0 else {
+ guard result == RAC_SUCCESS else {
throw SDKError.voiceAgent(.processingFailed, "Speech synthesis failed: \(result)")
}
+ guard let ptr = audioPtr, audioSize > 0 else {
+ return Data()
+ }
+
let audioData = Data(bytes: ptr, count: audioSize)
free(ptr)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
index 091842093..8e143cca9 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
@@ -108,6 +108,10 @@ public actor VoiceSessionHandle {
eventContinuation?.finish()
}
+ public func interruptPlayback() {
+ audioPlayback.stop()
+ }
+
/// Force process current audio (push-to-talk)
public func sendNow() async {
guard isRunning else { return }
@@ -115,6 +119,12 @@ public actor VoiceSessionHandle {
await processCurrentAudio()
}
+ /// Resume listening after a completed turn (for push-to-talk when continuousMode is false)
+ public func resumeListening() async {
+ guard isRunning else { return }
+ try? await startListening()
+ }
+
// MARK: - Private
private func emit(_ event: VoiceSessionEvent) {
@@ -196,44 +206,68 @@ public actor VoiceSessionHandle {
emit(.processing)
+ var transcription = ""
+ var cleanedResponse = ""
+ var thinkingContent: String?
+ var synthesizedAudio: Data?
+
do {
- let result = try await RunAnywhere.processVoiceTurn(audio)
+ // Step 1: Transcribe audio
+ transcription = try await RunAnywhere.voiceAgentTranscribe(audio)
- guard result.speechDetected else {
- logger.info("No speech detected")
+ guard !transcription.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+ logger.info("No speech detected (empty transcription)")
+ emit(.turnCompleted(transcript: "", response: "", thinkingContent: nil, audio: nil))
if config.continuousMode && isRunning {
try? await startListening()
}
return
}
- // Emit intermediate results
- if let transcript = result.transcription {
- emit(.transcribed(text: transcript))
- }
+ emit(.transcribed(text: transcription))
- if let response = result.response {
- emit(.responded(text: response))
+ // Step 2: Generate LLM response (apply /no_think prefix if needed)
+ let effectivePrompt: String
+ if !config.thinkingModeEnabled {
+ effectivePrompt = "/no_think\n\(transcription)"
+ } else {
+ effectivePrompt = transcription
}
- // Play TTS if enabled
- if config.autoPlayTTS, let ttsAudio = result.synthesizedAudio, !ttsAudio.isEmpty {
- emit(.speaking)
- try await audioPlayback.play(ttsAudio)
+ let options = LLMGenerationOptions(maxTokens: config.maxTokens ?? 100)
+ let result = try await RunAnywhere.generate(effectivePrompt, options: options)
+ // generate() already runs ThinkingContentParser internally
+ cleanedResponse = result.text
+ thinkingContent = result.thinkingContent
+
+ emit(.responded(text: cleanedResponse, thinkingContent: thinkingContent))
+
+ // Step 4: Synthesize speech from cleaned response (no think tags spoken)
+ if config.autoPlayTTS, !cleanedResponse.isEmpty {
+ let ttsAudio = try await RunAnywhere.voiceAgentSynthesizeSpeech(cleanedResponse)
+ synthesizedAudio = ttsAudio
+
+ if !ttsAudio.isEmpty {
+ emit(.speaking)
+ do {
+ try await audioPlayback.play(ttsAudio)
+ } catch is AudioPlaybackError {
+ logger.info("TTS playback interrupted by user")
+ }
+ }
}
-
- // Emit complete result
- emit(.turnCompleted(
- transcript: result.transcription ?? "",
- response: result.response ?? "",
- audio: result.synthesizedAudio
- ))
-
} catch {
logger.error("Processing failed: \(error)")
emit(.error(error.localizedDescription))
}
+ emit(.turnCompleted(
+ transcript: transcription,
+ response: cleanedResponse,
+ thinkingContent: thinkingContent,
+ audio: synthesizedAudio
+ ))
+
// Resume listening if continuous mode
if config.continuousMode && isRunning {
try? await startListening()
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
index da6c8ff1e..47f30a0d4 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
@@ -23,6 +23,9 @@ public struct VoiceAgentResult: Sendable {
/// Generated response text from LLM
public var response: String?
+ /// Thinking content extracted from `<think>...</think>` tags (nil if none)
+ public var thinkingContent: String?
+
/// Synthesized audio data from TTS
public var synthesizedAudio: Data?
@@ -31,11 +34,13 @@ public struct VoiceAgentResult: Sendable {
speechDetected: Bool = false,
transcription: String? = nil,
response: String? = nil,
+ thinkingContent: String? = nil,
synthesizedAudio: Data? = nil
) {
self.speechDetected = speechDetected
self.transcription = transcription
self.response = response
+ self.thinkingContent = thinkingContent
self.synthesizedAudio = synthesizedAudio
}
@@ -185,14 +190,14 @@ public enum VoiceSessionEvent: Sendable {
/// Got transcription from STT
case transcribed(text: String)
- /// Got response from LLM
- case responded(text: String)
+ /// Got response from LLM (with optional thinking content)
+ case responded(text: String, thinkingContent: String?)
/// Playing TTS audio
case speaking
- /// Complete turn result
- case turnCompleted(transcript: String, response: String, audio: Data?)
+ /// Complete turn result (with optional thinking content)
+ case turnCompleted(transcript: String, response: String, thinkingContent: String?, audio: Data?)
/// Session stopped
case stopped
@@ -217,16 +222,26 @@ public struct VoiceSessionConfig: Sendable {
/// Whether to auto-resume listening after TTS playback
public var continuousMode: Bool
+ /// Whether thinking mode is enabled for the LLM.
+ public var thinkingModeEnabled: Bool
+
+ /// Maximum tokens for LLM generation (nil uses SDK default of 100)
+ public var maxTokens: Int?
+
public init(
silenceDuration: TimeInterval = 1.5,
speechThreshold: Float = 0.1,
autoPlayTTS: Bool = true,
- continuousMode: Bool = true
+ continuousMode: Bool = true,
+ thinkingModeEnabled: Bool = false,
+ maxTokens: Int? = nil
) {
self.silenceDuration = silenceDuration
self.speechThreshold = speechThreshold
self.autoPlayTTS = autoPlayTTS
self.continuousMode = continuousMode
+ self.thinkingModeEnabled = thinkingModeEnabled
+ self.maxTokens = maxTokens
}
/// Default configuration