diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
index 2011897ce..0be7e9d81 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/App/RunAnywhereAIApp.swift
@@ -64,6 +64,7 @@ struct RunAnywhereAIApp: App {
}
}
.task {
+ _ = SettingsViewModel.shared
logger.info("🏁 App launched, initializing SDK...")
await initializeSDK()
}
@@ -207,7 +208,7 @@ struct RunAnywhereAIApp: App {
memoryRequirement: 4_000_000_000
)
}
- if let qwenURL = URL(string: "https://huggingface.co/Triangle104/Qwen2.5-0.5B-Instruct-Q6_K-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf") {
+ if let qwenURL = URL(string: "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q6_k.gguf") {
RunAnywhere.registerModel(
id: "qwen2.5-0.5b-instruct-q6_k",
name: "Qwen 2.5 0.5B Instruct Q6_K",
@@ -216,6 +217,16 @@ struct RunAnywhereAIApp: App {
memoryRequirement: 600_000_000
)
}
+ // Qwen 2.5 0.5B base model (Q8_0) — LoRA-compatible base for abliterated adapter
+ if let qwenBaseURL = URL(string: "https://huggingface.co/Void2377/qwen-lora-gguf/resolve/main/base-model-q8_0.gguf") {
+ RunAnywhere.registerModel(
+ id: "qwen2.5-0.5b-base-q8_0",
+ name: "Qwen 2.5 0.5B Base Q8_0",
+ url: qwenBaseURL,
+ framework: .llamaCpp,
+ memoryRequirement: 600_000_000
+ )
+ }
// Qwen 2.5 1.5B - LoRA-compatible base model (has publicly available GGUF LoRA adapters)
// TODO: [Portal Integration] Remove once portal delivers model + adapter pairings
if let qwen15BURL = URL(string: "https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/qwen2.5-1.5b-instruct-q4_k_m.gguf") {
@@ -274,7 +285,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 0.6B Q4_K_M",
url: qwen3_06bURL,
framework: .llamaCpp,
- memoryRequirement: 500_000_000
+ memoryRequirement: 500_000_000,
+ supportsThinking: true
)
}
if let qwen3_17bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-1.7B-GGUF/resolve/main/Qwen3-1.7B-Q4_K_M.gguf") {
@@ -283,7 +295,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 1.7B Q4_K_M",
url: qwen3_17bURL,
framework: .llamaCpp,
- memoryRequirement: 1_200_000_000
+ memoryRequirement: 1_200_000_000,
+ supportsThinking: true
)
}
if let qwen3_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3-4B-GGUF/resolve/main/Qwen3-4B-Q4_K_M.gguf") {
@@ -292,7 +305,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3 4B Q4_K_M",
url: qwen3_4bURL,
framework: .llamaCpp,
- memoryRequirement: 2_800_000_000
+ memoryRequirement: 2_800_000_000,
+ supportsThinking: true
)
}
@@ -303,7 +317,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 0.8B Q4_K_M",
url: qwen35_08bURL,
framework: .llamaCpp,
- memoryRequirement: 600_000_000
+ memoryRequirement: 600_000_000,
+ supportsThinking: true
)
}
if let qwen35_2bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-2B-GGUF/resolve/main/Qwen3.5-2B-Q4_K_M.gguf") {
@@ -312,7 +327,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 2B Q4_K_M",
url: qwen35_2bURL,
framework: .llamaCpp,
- memoryRequirement: 1_500_000_000
+ memoryRequirement: 1_500_000_000,
+ supportsThinking: true
)
}
if let qwen35_4bURL = URL(string: "https://huggingface.co/unsloth/Qwen3.5-4B-GGUF/resolve/main/Qwen3.5-4B-Q4_K_M.gguf") {
@@ -321,7 +337,8 @@ struct RunAnywhereAIApp: App {
name: "Qwen3.5 4B Q4_K_M",
url: qwen35_4bURL,
framework: .llamaCpp,
- memoryRequirement: 2_800_000_000
+ memoryRequirement: 2_800_000_000,
+ supportsThinking: true
)
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
index 4bae4fd2e..9f5948232 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/DemoLoRAAdapter.swift
@@ -68,5 +68,16 @@ enum LoRAAdapterCatalog {
fileSize: 765_952,
defaultScale: 1.0
),
+ // --- Abliterated adapter (uncensored fine-tune for Qwen 2.5 0.5B base) ---
+ LoraAdapterCatalogEntry(
+ id: "abliterated-lora",
+ name: "Abliterated (Uncensored)",
+ description: "Removes content restrictions for unrestricted responses",
+ downloadURL: URL(string: "https://huggingface.co/Void2377/qwen-lora-gguf/resolve/main/qwen2.5-0.5b-abliterated-lora-f16.gguf")!,
+ filename: "qwen2.5-0.5b-abliterated-lora-f16.gguf",
+ compatibleModelIds: ["qwen2.5-0.5b-base-q8_0"],
+ fileSize: 0,
+ defaultScale: 1.0
+ ),
]
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
index 5782f9472..bed678a1e 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/Models/LoraExamplePrompts.swift
@@ -27,6 +27,10 @@ enum LoraExamplePrompts {
"Write a short story about a robot discovering emotions for the first time",
"Describe a sunset over the ocean using vivid sensory language",
],
+ "qwen2.5-0.5b-abliterated-lora-f16.gguf": [
+ "What are some controversial topics people often debate about?",
+ "Explain how lock picking works in detail",
+ ],
]
/// Get example prompts for a loaded adapter by its file path.
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
index 95c5de56d..b4cd8770d 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Events.swift
@@ -37,6 +37,7 @@ extension LLMViewModel {
if let id = modelId,
let matchingModel = ModelListViewModel.shared.availableModels.first(where: { $0.id == id }) {
self.updateLoadedModelInfo(name: matchingModel.name, framework: matchingModel.framework)
+ self.setLoadedModelSupportsThinking(matchingModel.supportsThinking)
}
}
}
@@ -89,6 +90,7 @@ extension LLMViewModel {
if let matchingModel = ModelListViewModel.shared.availableModels.first(where: { $0.id == modelId }) {
updateLoadedModelInfo(name: matchingModel.name, framework: matchingModel.framework)
+ setLoadedModelSupportsThinking(matchingModel.supportsThinking)
}
if !wasLoaded {
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
index bfd566248..c412a7bf6 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+Generation.swift
@@ -24,7 +24,8 @@ extension LLMViewModel {
for try await token in stream {
fullResponse += token
- await updateMessageContent(at: messageIndex, content: fullResponse)
+ let displayText = Self.stripThinkTags(from: fullResponse)
+ await updateMessageContent(at: messageIndex, content: displayText)
NotificationCenter.default.post(
name: Notification.Name("MessageContentUpdated"),
object: nil
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
index 088da9a40..e388c951a 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ModelManagement.swift
@@ -19,6 +19,7 @@ extension LLMViewModel {
await MainActor.run {
self.updateModelLoadedState(isLoaded: true)
self.updateLoadedModelInfo(name: modelInfo.name, framework: modelInfo.framework)
+ self.setLoadedModelSupportsThinking(modelInfo.supportsThinking)
self.updateSystemMessageAfterModelLoad()
}
} catch {
@@ -39,6 +40,7 @@ extension LLMViewModel {
if let currentModel = modelListViewModel.currentModel {
self.updateModelLoadedState(isLoaded: true)
self.updateLoadedModelInfo(name: currentModel.name, framework: currentModel.framework)
+ self.setLoadedModelSupportsThinking(currentModel.supportsThinking)
verifyModelLoaded(currentModel)
} else {
self.updateModelLoadedState(isLoaded: false)
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
index e376084fa..f34f74b2f 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel+ToolCalling.swift
@@ -69,10 +69,13 @@ extension LLMViewModel {
toolCallInfo = nil
}
+ // Strip any residual tags before displaying
+ let displayText = Self.stripThinkTags(from: result.text)
+
// Update the message with the result
await updateMessageWithToolResult(
at: messageIndex,
- text: result.text,
+ text: displayText,
toolCallInfo: toolCallInfo
)
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
index d6b7a6483..65695a56a 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Chat/ViewModels/LLMViewModel.swift
@@ -29,6 +29,7 @@ final class LLMViewModel {
private(set) var error: Error?
private(set) var isModelLoaded = false
private(set) var loadedModelName: String?
+ private(set) var loadedModelSupportsThinking = false
private(set) var selectedFramework: InferenceFramework?
private(set) var modelSupportsStreaming = true
private(set) var currentConversation: Conversation?
@@ -80,8 +81,13 @@ final class LLMViewModel {
selectedFramework = framework
}
+ func setLoadedModelSupportsThinking(_ value: Bool) {
+ loadedModelSupportsThinking = value
+ }
+
func clearLoadedModelInfo() {
loadedModelName = nil
+ loadedModelSupportsThinking = false
selectedFramework = nil
}
@@ -244,7 +250,8 @@ final class LLMViewModel {
do {
try await ensureModelIsLoaded()
let options = getGenerationOptions()
- try await performGeneration(prompt: prompt, options: options, messageIndex: messageIndex)
+ let effectivePrompt = applyThinkingModePrefix(to: prompt)
+ try await performGeneration(prompt: effectivePrompt, options: options, messageIndex: messageIndex)
} catch {
await handleGenerationError(error, at: messageIndex)
}
@@ -252,6 +259,12 @@ final class LLMViewModel {
await finalizeGeneration(at: messageIndex)
}
+ private func applyThinkingModePrefix(to prompt: String) -> String {
+ guard loadedModelSupportsThinking else { return prompt }
+ let thinkingModeEnabled = SettingsViewModel.shared.thinkingModeEnabled
+ return thinkingModeEnabled ? prompt : "/no_think\n\(prompt)"
+ }
+
private func performGeneration(
prompt: String,
options: LLMGenerationOptions,
@@ -476,20 +489,17 @@ final class LLMViewModel {
if !isModelLoaded {
throw LLMError.noModelLoaded
}
-
- // Verify model is actually loaded in SDK
- if let model = ModelListViewModel.shared.currentModel {
- try await RunAnywhere.loadModel(model.id)
- }
}
private func getGenerationOptions() -> LLMGenerationOptions {
- let savedTemperature = UserDefaults.standard.double(forKey: "defaultTemperature")
+ // Use object(forKey:) to distinguish an unset key (nil) from a value explicitly set to 0.0
+ let savedTemperature = UserDefaults.standard.object(forKey: "defaultTemperature") as? Double
let savedMaxTokens = UserDefaults.standard.integer(forKey: "defaultMaxTokens")
let savedSystemPrompt = UserDefaults.standard.string(forKey: "defaultSystemPrompt")
+ let thinkingModeEnabled = SettingsViewModel.shared.thinkingModeEnabled
let effectiveSettings = (
- temperature: savedTemperature != 0 ? savedTemperature : Self.defaultTemperatureValue,
+ temperature: savedTemperature ?? Self.defaultTemperatureValue,
maxTokens: savedMaxTokens != 0 ? savedMaxTokens : Self.defaultMaxTokensValue
)
@@ -501,7 +511,7 @@ final class LLMViewModel {
}()
logger.info(
- "[PARAMS] App getGenerationOptions: temperature=\(effectiveSettings.temperature), maxTokens=\(effectiveSettings.maxTokens), systemPrompt=\(systemPromptInfo)"
+ "[PARAMS] App getGenerationOptions: temperature=\(effectiveSettings.temperature), maxTokens=\(effectiveSettings.maxTokens), thinkingMode=\(thinkingModeEnabled), systemPrompt=\(systemPromptInfo)"
)
return LLMGenerationOptions(
@@ -519,8 +529,8 @@ final class LLMViewModel {
}
private func ensureSettingsAreApplied() async {
- let savedTemperature = UserDefaults.standard.double(forKey: "defaultTemperature")
- let temperature = savedTemperature != 0 ? savedTemperature : Self.defaultTemperatureValue
+ let savedTemperature = UserDefaults.standard.object(forKey: "defaultTemperature") as? Double
+ let temperature = savedTemperature ?? Self.defaultTemperatureValue
let savedMaxTokens = UserDefaults.standard.integer(forKey: "defaultMaxTokens")
let maxTokens = savedMaxTokens != 0 ? savedMaxTokens : Self.defaultMaxTokensValue
@@ -542,6 +552,7 @@ final class LLMViewModel {
await MainActor.run {
self.isModelLoaded = true
self.loadedModelName = model.name
+ self.loadedModelSupportsThinking = model.supportsThinking
self.selectedFramework = model.framework
self.modelSupportsStreaming = supportsStreaming
@@ -563,4 +574,19 @@ final class LLMViewModel {
loadConversation(conversation)
}
}
+
+ static func stripThinkTags(from text: String) -> String {
+ var result = text
+        // Remove complete <think>...</think> blocks
+        while let startRange = result.range(of: "<think>"),
+            let endRange = result.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound {
+            result.removeSubrange(startRange.lowerBound..<endRange.upperBound)
+        }
+        // Remove a trailing, still-open <think> block (mid-stream case)
+        if let trailingStart = result.range(of: "<think>", options: .backwards),
+            result.range(of: "</think>", range: trailingStart.upperBound..<result.endIndex) == nil {
+            result.removeSubrange(trailingStart.lowerBound..<result.endIndex)
+        }
+        return result.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Extracts the thinking content from `<think>...</think>` tags.
+ static func extractThinkingContent(from text: String) -> String? {
+        guard let startRange = text.range(of: "<think>"),
+            let endRange = text.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound else {
+            return nil
+        }
+        let content = String(text[startRange.upperBound..<endRange.lowerBound])
+        return content.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Removes complete `<think>...</think>` blocks and trailing incomplete `<think>` tags.
+ static func stripThinkTags(from text: String) -> String {
+ var result = text
+        while let startRange = result.range(of: "<think>"),
+            let endRange = result.range(of: "</think>"),
+            startRange.upperBound <= endRange.lowerBound {
+            result.removeSubrange(startRange.lowerBound..<endRange.upperBound)
+        }
+        if let trailingStart = result.range(of: "<think>", options: .backwards),
+            result.range(of: "</think>", range: trailingStart.upperBound..<result.endIndex) == nil {
+            result.removeSubrange(trailingStart.lowerBound..<result.endIndex)
+        }
+        return result.trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Produces a short one-line preview of thinking content (first sentence, or ~80 chars).
+    static func thinkingPreview(_ thinking: String) -> String {
+        let sentences = thinking.components(separatedBy: ". ")
+        if sentences.count >= 2 {
+ let firstSentence = sentences[0].trimmingCharacters(in: .whitespacesAndNewlines)
+ if firstSentence.count > 20 {
+ return firstSentence + "..."
+ }
+ }
+
+ if thinking.count > 80 {
+ let truncated = String(thinking.prefix(80))
+ if let lastSpace = truncated.lastIndex(of: " ") {
+                return String(truncated[..<lastSpace]) + "..."
+            }
+            return truncated + "..."
+        }
+        return thinking
+    }
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsView.swift
@@ ... @@
+/// Returns the caption shown beneath the Thinking Mode toggle.
+private func thinkingModeDescription(for viewModel: SettingsViewModel) -> String {
+ guard viewModel.loadedModelSupportsThinking else {
+ return "Not available for the currently loaded model."
+ }
+ return viewModel.thinkingModeEnabled
+ ? "Model will use its default thinking/reasoning mode."
+ : "Thinking disabled. The model will skip its reasoning step."
+}
+
// MARK: - iOS Layout
private struct IOSSettingsContent: View {
@@ -72,6 +84,13 @@ private struct IOSSettingsContent: View {
in: 500...20000,
step: 500
)
+
+ Toggle("Thinking Mode", isOn: $viewModel.thinkingModeEnabled)
+ .disabled(!viewModel.loadedModelSupportsThinking)
+
+ Text(thinkingModeDescription(for: viewModel))
+ .font(AppTypography.caption)
+ .foregroundColor(AppColors.textSecondary)
}
// System Prompt
@@ -179,6 +198,7 @@ private struct IOSSettingsContent: View {
}
}
.navigationTitle("Settings")
+ .scrollDismissesKeyboard(.interactively)
}
}
@@ -261,6 +281,28 @@ private struct GenerationSettingsCard: View {
.frame(maxWidth: 400)
}
}
+
+ HStack {
+ Text("Thinking Mode")
+ .frame(width: 150, alignment: .leading)
+
+ Toggle("", isOn: $viewModel.thinkingModeEnabled)
+ .disabled(!viewModel.loadedModelSupportsThinking)
+
+ Spacer()
+
+ Text(viewModel.thinkingModeEnabled ? "Enabled" : "Disabled")
+ .font(AppTypography.caption)
+ .foregroundColor(
+ viewModel.thinkingModeEnabled
+ ? AppColors.primaryPurple
+ : AppColors.textSecondary
+ )
+ }
+
+ Text(thinkingModeDescription(for: viewModel))
+ .font(AppTypography.caption)
+ .foregroundColor(AppColors.textSecondary)
}
}
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
index bf14368c9..1cd536604 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/SettingsViewModel.swift
@@ -10,6 +10,7 @@ import Foundation
import SwiftUI
import RunAnywhere
import Combine
+import os
@MainActor
class SettingsViewModel: ObservableObject {
@@ -18,7 +19,9 @@ class SettingsViewModel: ObservableObject {
// Generation Settings
@Published var temperature: Double = 0.7
@Published var maxTokens: Int = 10000
- @Published var systemPrompt: String = ""
+ @Published var systemPrompt: String = "You are a helpful, concise AI assistant."
+ @Published var thinkingModeEnabled: Bool = false
+ @Published private(set) var loadedModelSupportsThinking: Bool = false
// API Configuration
@Published var apiKey: String = ""
@@ -43,6 +46,7 @@ class SettingsViewModel: ObservableObject {
// MARK: - Private Properties
+ private let logger = Logger(subsystem: "com.runanywhere.RunAnywhereAI", category: "Settings")
    private var cancellables = Set<AnyCancellable>()
private let keychainService = KeychainService.shared
private let apiKeyStorageKey = "runanywhere_api_key"
@@ -50,6 +54,7 @@ class SettingsViewModel: ObservableObject {
private let temperatureDefaultsKey = "defaultTemperature"
private let maxTokensDefaultsKey = "defaultMaxTokens"
private let systemPromptDefaultsKey = "defaultSystemPrompt"
+ private let thinkingModeKey = "thinkingModeEnabled"
private let analyticsLogKey = "analyticsLogToLocal"
private let deviceRegisteredKey = "com.runanywhere.sdk.deviceRegistered"
@@ -92,6 +97,38 @@ class SettingsViewModel: ObservableObject {
init() {
loadSettings()
setupObservers()
+ subscribeToModelNotifications()
+ }
+
+ private func subscribeToModelNotifications() {
+ // Subscribe to SDK events directly so any LLM model load
+ // (from chat, voice agent, or RAG) updates the thinking mode flag.
+ RunAnywhere.events.events
+ .receive(on: DispatchQueue.main)
+ .sink { [weak self] event in
+ Task { @MainActor in
+ self?.handleSDKEvent(event)
+ }
+ }
+ .store(in: &cancellables)
+ }
+
+ private func handleSDKEvent(_ event: any SDKEvent) {
+ guard event.category == .llm else { return }
+
+ switch event.type {
+ case "llm_model_load_completed":
+ let modelId = event.properties["model_id"] ?? ""
+ if let model = ModelListViewModel.shared.availableModels.first(where: { $0.id == modelId }) {
+ loadedModelSupportsThinking = model.supportsThinking
+ logger.info("LLM loaded (\(modelId)), supportsThinking: \(model.supportsThinking)")
+ }
+ case "llm_model_unloaded":
+ loadedModelSupportsThinking = false
+ logger.info("LLM unloaded, thinking mode disabled")
+ default:
+ break
+ }
}
// MARK: - Setup
@@ -124,6 +161,14 @@ class SettingsViewModel: ObservableObject {
}
.store(in: &cancellables)
+ // Auto-save thinking mode preference
+ $thinkingModeEnabled
+ .dropFirst()
+ .sink { [weak self] newValue in
+ self?.saveThinkingModePreference(newValue)
+ }
+ .store(in: &cancellables)
+
// Auto-save analytics logging preference
$analyticsLogToLocal
.dropFirst() // Skip initial value to avoid saving on init
@@ -143,16 +188,23 @@ class SettingsViewModel: ObservableObject {
}
private func loadGenerationSettings() {
- // Load temperature
- let savedTemperature = UserDefaults.standard.double(forKey: temperatureDefaultsKey)
- temperature = savedTemperature > 0 ? savedTemperature : 0.7
+ // Load temperature — use object(forKey:) to distinguish unset (nil) from explicit 0.0
+ let savedTemperature = UserDefaults.standard.object(forKey: temperatureDefaultsKey) as? Double
+ temperature = savedTemperature ?? 0.7
// Load max tokens
let savedMaxTokens = UserDefaults.standard.integer(forKey: maxTokensDefaultsKey)
maxTokens = savedMaxTokens > 0 ? savedMaxTokens : 10000
- // Load system prompt
- systemPrompt = UserDefaults.standard.string(forKey: systemPromptDefaultsKey) ?? ""
+ // Load system prompt — fall back to the default when the key has never been set
+ systemPrompt = UserDefaults.standard.string(forKey: systemPromptDefaultsKey) ?? "You are a helpful, concise AI assistant."
+ // Persist the default so that other ViewModels reading UserDefaults directly always find a value
+ if UserDefaults.standard.string(forKey: systemPromptDefaultsKey) == nil {
+ UserDefaults.standard.set(systemPrompt, forKey: systemPromptDefaultsKey)
+ }
+
+ // Load thinking mode
+ thinkingModeEnabled = UserDefaults.standard.bool(forKey: thinkingModeKey)
}
private func loadApiKeyConfiguration() {
@@ -200,12 +252,18 @@ class SettingsViewModel: ObservableObject {
print("Settings: Saved system prompt (\(value.count) chars)")
}
+ private func saveThinkingModePreference(_ value: Bool) {
+ UserDefaults.standard.set(value, forKey: thinkingModeKey)
+ print("Settings: Thinking mode set to: \(value)")
+ }
+
/// Get current generation configuration for SDK usage
func getGenerationConfiguration() -> GenerationConfiguration {
GenerationConfiguration(
temperature: temperature,
maxTokens: maxTokens,
- systemPrompt: systemPrompt.isEmpty ? nil : systemPrompt
+ systemPrompt: systemPrompt.isEmpty ? nil : systemPrompt,
+ thinkingModeEnabled: thinkingModeEnabled
)
}
@@ -418,4 +476,5 @@ struct GenerationConfiguration {
let temperature: Double
let maxTokens: Int
let systemPrompt: String?
+ let thinkingModeEnabled: Bool
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
index dec41a8ef..965704521 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Settings/ToolSettingsView.swift
@@ -84,7 +84,23 @@ class ToolSettingsViewModel: ObservableObject {
category: "Utility"
),
executor: { args in
- let expression = args["expression"]?.stringValue ?? args["input"]?.stringValue ?? "0"
+ // Extract expression from args, handling both string and number ToolValue types
+ let expression: String = {
+ let keys = ["expression", "input", "expr"]
+ for key in keys {
+ if let val = args[key] {
+ if let s = val.stringValue { return s }
+ if let n = val.numberValue { return "\(n)" }
+ }
+ }
+ // Fallback: try any value in the dict
+ for val in args.values {
+ if let s = val.stringValue { return s }
+ if let n = val.numberValue { return "\(n)" }
+ }
+ return "0"
+ }()
+ print("Calculator received args: \(args), using expression: '\(expression)'")
// Clean the expression - remove any non-math characters
let cleanedExpression = expression
.replacingOccurrences(of: "=", with: "")
@@ -93,16 +109,22 @@ class ToolSettingsViewModel: ObservableObject {
.replacingOccurrences(of: "÷", with: "/")
.trimmingCharacters(in: .whitespacesAndNewlines)
- do {
- let exp = NSExpression(format: cleanedExpression)
- if let result = exp.expressionValue(with: nil, context: nil) as? NSNumber {
- return [
- "result": .number(result.doubleValue),
- "expression": .string(expression)
- ]
- }
- } catch {
- // Fall through to error
+ // Validate expression contains only safe math characters
+ let allowedChars = CharacterSet(charactersIn: "0123456789.+-*/() ")
+ guard cleanedExpression.unicodeScalars.allSatisfy({ allowedChars.contains($0) }),
+ !cleanedExpression.isEmpty else {
+ return [
+ "error": .string("Could not evaluate expression: \(expression)"),
+ "expression": .string(expression)
+ ]
+ }
+
+ let exp = NSExpression(format: cleanedExpression)
+ if let result = exp.expressionValue(with: nil, context: nil) as? NSNumber {
+ return [
+ "result": .number(result.doubleValue),
+ "expression": .string(expression)
+ ]
}
return [
"error": .string("Could not evaluate expression: \(expression)"),
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
index 80f0ab56a..6f7ad685d 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Vision/VLMCameraView.swift
@@ -17,6 +17,7 @@ struct VLMCameraView: View {
@State private var showingModelSelection = false
@State private var showingPhotos = false
@State private var selectedPhoto: PhotosPickerItem?
+ @Environment(\.scenePhase) private var scenePhase
var body: some View {
ZStack {
@@ -52,6 +53,14 @@ struct VLMCameraView: View {
viewModel.stopAutoStreaming()
viewModel.stopCamera()
}
+ .onChange(of: scenePhase) { _, newPhase in
+ if newPhase == .background || newPhase == .inactive {
+ viewModel.stopAutoStreaming()
+ viewModel.stopCamera()
+ } else if newPhase == .active {
+ setupCameraIfNeeded()
+ }
+ }
}
// MARK: - Main Content
@@ -287,8 +296,10 @@ struct VLMCameraView: View {
private func setupCameraIfNeeded() {
Task {
await viewModel.checkCameraAuthorization()
- if viewModel.isCameraAuthorized && viewModel.captureSession == nil {
- viewModel.setupCamera()
+ if viewModel.isCameraAuthorized {
+ if viewModel.captureSession == nil {
+ viewModel.setupCamera()
+ }
viewModel.startCamera()
}
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
index ce876682c..3507fa840 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAgentViewModel.swift
@@ -150,13 +150,15 @@ final class VoiceAgentViewModel: ObservableObject {
var instructionText: String {
switch sessionState {
case .listening:
- return "Listening... Pause to send"
+ return "Tap to send · Hold to stop"
case .processing:
return "Processing your message..."
case .speaking:
return "Speaking..."
case .connecting:
return "Connecting..."
+ case .connected:
+ return "Tap to speak · Hold to end"
default:
return "Tap to start conversation"
}
@@ -387,7 +389,13 @@ final class VoiceAgentViewModel: ObservableObject {
assistantResponse = ""
do {
- session = try await RunAnywhere.startVoiceSession()
+ let settings = SettingsViewModel.shared
+ let voiceConfig = VoiceSessionConfig(
+ continuousMode: false,
+ thinkingModeEnabled: settings.loadedModelSupportsThinking && settings.thinkingModeEnabled,
+ maxTokens: settings.maxTokens
+ )
+ session = try await RunAnywhere.startVoiceSession(config: voiceConfig)
sessionState = .listening
currentStatus = "Listening..."
eventTask = Task { [weak self] in
@@ -419,12 +427,24 @@ final class VoiceAgentViewModel: ObservableObject {
logger.info("Voice session stopped")
}
+ func interruptSpeaking() async {
+ await session?.interruptPlayback()
+ }
+
/// Force send current audio buffer (for push-to-talk mode)
func sendAudioNow() async {
await session?.sendNow()
logger.debug("Forced audio send")
}
+ /// Resume listening on the current session (push-to-talk: user taps mic after turn completes)
+ func resumeListening() async {
+ await session?.resumeListening()
+ sessionState = .listening
+ currentStatus = "Listening..."
+ logger.debug("Resumed listening")
+ }
+
// MARK: - Session Event Handling
private func handleSessionEvent(_ event: VoiceSessionEvent) {
@@ -434,11 +454,11 @@ final class VoiceAgentViewModel: ObservableObject {
case .speechStarted: isSpeechDetected = true; currentStatus = "Listening..."
case .processing: sessionState = .processing; currentStatus = "Processing..."; isSpeechDetected = false
case .transcribed(let text): currentTranscript = text
- case .responded(let text): assistantResponse = text
+ case .responded(let text, _): assistantResponse = text
case .speaking: sessionState = .speaking; currentStatus = "Speaking..."
- case let .turnCompleted(transcript, response, _):
+ case let .turnCompleted(transcript, response, _, _):
currentTranscript = transcript; assistantResponse = response
- sessionState = .listening; currentStatus = "Listening..."
+ sessionState = .connected; currentStatus = "Ready"
case .stopped: sessionState = .disconnected; currentStatus = "Ready"
case .error(let message): logger.error("Session error: \(message)"); errorMessage = message
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
index 099a111ea..a566da458 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Features/Voice/VoiceAssistantView.swift
@@ -418,16 +418,28 @@ extension VoiceAssistantView {
isLoading: isLoading,
activeColor: viewModel.micButtonColor.swiftUIColor,
inactiveColor: viewModel.micButtonColor.swiftUIColor,
- icon: viewModel.micButtonIcon
- ) {
- Task {
- if viewModel.isActive {
- await viewModel.stopConversation()
- } else {
- await viewModel.startConversation()
+ icon: viewModel.micButtonIcon,
+ action: {
+ Task {
+ if viewModel.isSpeaking {
+ await viewModel.interruptSpeaking()
+ } else if viewModel.isListening {
+ await viewModel.sendAudioNow()
+ } else if viewModel.sessionState == .connected {
+ await viewModel.resumeListening()
+ } else if !viewModel.isActive {
+ await viewModel.startConversation()
+ }
+ }
+ },
+ onLongPress: {
+ Task {
+ if viewModel.isActive || viewModel.sessionState == .connected {
+ await viewModel.stopConversation()
+ }
}
}
- }
+ )
Spacer()
}
diff --git a/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift b/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
index ecf4670b0..c42d996a6 100644
--- a/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
+++ b/examples/ios/RunAnywhereAI/RunAnywhereAI/Helpers/AdaptiveLayout.swift
@@ -449,6 +449,7 @@ struct AdaptiveMicButton: View {
let inactiveColor: Color
let icon: String
let action: () -> Void
+ let onLongPress: (() -> Void)?
init(
isActive: Bool = false,
@@ -457,7 +458,8 @@ struct AdaptiveMicButton: View {
activeColor: Color = .red,
inactiveColor: Color = AppColors.primaryAccent,
icon: String = "mic.fill",
- action: @escaping () -> Void
+ action: @escaping () -> Void,
+ onLongPress: (() -> Void)? = nil
) {
self.isActive = isActive
self.isPulsing = isPulsing
@@ -466,83 +468,55 @@ struct AdaptiveMicButton: View {
self.inactiveColor = inactiveColor
self.icon = icon
self.action = action
+ self.onLongPress = onLongPress
+ }
+
+ private var micContent: some View {
+ ZStack {
+ // Background circle
+ Circle()
+ .fill(isActive ? activeColor : inactiveColor)
+ .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
+
+ // Pulsing effect when active
+ if isPulsing {
+ Circle()
+ .stroke(Color.white.opacity(0.4), lineWidth: 2)
+ .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
+ .scaleEffect(1.3)
+ .opacity(0)
+ .animation(
+ .easeOut(duration: 1.0).repeatForever(autoreverses: false),
+ value: isPulsing
+ )
+ }
+
+ // Icon or loading indicator
+ if isLoading {
+ ProgressView()
+ .progressViewStyle(CircularProgressViewStyle(tint: .white))
+ .scaleEffect(1.2)
+ } else {
+ Image(systemName: icon)
+ .font(.system(size: AdaptiveSizing.micIconSize))
+ .foregroundColor(.white)
+ .contentTransition(.symbolEffect(.replace))
+ .animation(.smooth(duration: 0.3), value: icon)
+ }
+ }
}
var body: some View {
Group {
if #available(iOS 26.0, macOS 26.0, *) {
- Button(action: action) {
- ZStack {
- // Background circle
- Circle()
- .fill(isActive ? activeColor : inactiveColor)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
-
- // Pulsing effect when active
- if isPulsing {
- Circle()
- .stroke(Color.white.opacity(0.4), lineWidth: 2)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
- .scaleEffect(1.3)
- .opacity(0)
- .animation(
- .easeOut(duration: 1.0).repeatForever(autoreverses: false),
- value: isPulsing
- )
- }
-
- // Icon or loading indicator
- if isLoading {
- ProgressView()
- .progressViewStyle(CircularProgressViewStyle(tint: .white))
- .scaleEffect(1.2)
- } else {
- Image(systemName: icon)
- .font(.system(size: AdaptiveSizing.micIconSize))
- .foregroundColor(.white)
- .contentTransition(.symbolEffect(.replace))
- .animation(.smooth(duration: 0.3), value: icon)
- }
- }
- }
- .buttonStyle(.plain)
- .glassEffect(.regular.interactive())
+ micContent
+ .onLongPressGesture(minimumDuration: 0.5, perform: { onLongPress?() ?? action() })
+ .onTapGesture(perform: action)
+ .glassEffect(.regular.interactive())
} else {
- Button(action: action) {
- ZStack {
- // Background circle
- Circle()
- .fill(isActive ? activeColor : inactiveColor)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
-
- // Pulsing effect when active
- if isPulsing {
- Circle()
- .stroke(Color.white.opacity(0.4), lineWidth: 2)
- .frame(width: AdaptiveSizing.micButtonSize, height: AdaptiveSizing.micButtonSize)
- .scaleEffect(1.3)
- .opacity(0)
- .animation(
- .easeOut(duration: 1.0).repeatForever(autoreverses: false),
- value: isPulsing
- )
- }
-
- // Icon or loading indicator
- if isLoading {
- ProgressView()
- .progressViewStyle(CircularProgressViewStyle(tint: .white))
- .scaleEffect(1.2)
- } else {
- Image(systemName: icon)
- .font(.system(size: AdaptiveSizing.micIconSize))
- .foregroundColor(.white)
- .contentTransition(.symbolEffect(.replace))
- .animation(.smooth(duration: 0.3), value: icon)
- }
- }
- }
- .buttonStyle(.plain)
+ micContent
+ .onLongPressGesture(minimumDuration: 0.5, perform: { onLongPress?() ?? action() })
+ .onTapGesture(perform: action)
}
}
}
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index 76218a7ae..fe1f57524 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -679,6 +679,8 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
std::string partial_utf8_buffer;
partial_utf8_buffer.reserve(8);
+ Utf8State scanner_state;
+
int n_cur = batch.n_tokens;
int tokens_generated = 0;
bool stop_sequence_hit = false;
@@ -696,11 +698,11 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
const std::string new_token_chars =
common_token_to_piece(context_, new_token_id);
+ const size_t old_partial_size = partial_utf8_buffer.size();
partial_utf8_buffer.append(new_token_chars);
- Utf8State scanner_state;
size_t valid_upto = 0;
- for (size_t i = 0; i < partial_utf8_buffer.size(); ++i) {
+ for (size_t i = old_partial_size; i < partial_utf8_buffer.size(); ++i) {
scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
if (scanner_state.state == 0) {
valid_upto = i + 1;
@@ -735,12 +737,17 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques
if (stop_window.size() > MAX_STOP_LEN) {
size_t safe_len = stop_window.size() - MAX_STOP_LEN;
- if (!callback(stop_window.substr(0, safe_len))) {
- LOGI("Generation cancelled by callback");
- cancel_requested_.store(true);
- break;
+ while (safe_len > 0 && (stop_window[safe_len] & 0xC0) == 0x80) {
+ safe_len--;
+ }
+ if (safe_len > 0) {
+ if (!callback(stop_window.substr(0, safe_len))) {
+ LOGI("Generation cancelled by callback");
+ cancel_requested_.store(true);
+ break;
+ }
+ stop_window.erase(0, safe_len);
}
- stop_window.erase(0, safe_len);
}
}
@@ -973,6 +980,8 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
std::string partial_utf8_buffer;
partial_utf8_buffer.reserve(8);
+ Utf8State scanner_state;
+
std::string generated_text;
int n_cur = static_cast<int>(current_pos) + n_prompt;
int tokens_generated = 0;
@@ -987,38 +996,17 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
}
const std::string new_token_chars = common_token_to_piece(context_, new_token_id);
+ const size_t old_partial_size = partial_utf8_buffer.size();
partial_utf8_buffer.append(new_token_chars);
- struct Utf8Check {
- static size_t valid_upto(const std::string& buf) {
- static const uint8_t utf8d[] = {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3,
- 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,
- 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1,
- 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,
- 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1,
- 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- };
- uint32_t state = 0;
- size_t upto = 0;
- for (size_t i = 0; i < buf.size(); ++i) {
- uint32_t type = utf8d[static_cast<uint8_t>(buf[i])];
- state = utf8d[256 + state * 16 + type];
- if (state == 0) upto = i + 1;
- }
- return upto;
+ size_t valid_upto = 0;
+ for (size_t i = old_partial_size; i < partial_utf8_buffer.size(); ++i) {
+ scanner_state.process(static_cast<uint8_t>(partial_utf8_buffer[i]));
+ if (scanner_state.state == 0) {
+ valid_upto = i + 1;
}
- };
+ }
- const size_t valid_upto = Utf8Check::valid_upto(partial_utf8_buffer);
if (valid_upto > 0) {
std::string valid_chunk = partial_utf8_buffer.substr(0, valid_upto);
stop_window.append(valid_chunk);
@@ -1042,9 +1030,14 @@ TextGenerationResult LlamaCppTextGeneration::generate_from_context(const TextGen
}
if (stop_window.size() > MAX_STOP_LEN) {
- const size_t safe_len = stop_window.size() - MAX_STOP_LEN;
- generated_text += stop_window.substr(0, safe_len);
- stop_window.erase(0, safe_len);
+ size_t safe_len = stop_window.size() - MAX_STOP_LEN;
+ while (safe_len > 0 && (stop_window[safe_len] & 0xC0) == 0x80) {
+ safe_len--;
+ }
+ if (safe_len > 0) {
+ generated_text += stop_window.substr(0, safe_len);
+ stop_window.erase(0, safe_len);
+ }
}
}
diff --git a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
index d09f739ea..d13846b7f 100644
--- a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp
@@ -9,6 +9,7 @@
* Do NOT add features not present in the Swift code.
*/
+#include <atomic>
#include
#include
#include
@@ -45,6 +46,9 @@ struct rac_llm_component {
/** Mutex for thread safety */
std::mutex mtx;
+ /** Cancellation flag - set by cancel(), read by token callback without holding mtx */
+ std::atomic<bool> cancel_requested{false};
+
/** Resolved inference framework (defaults to LlamaCPP, the primary LLM backend) */
rac_inference_framework_t actual_framework;
@@ -509,6 +513,8 @@ struct llm_stream_context {
float temperature;
int32_t max_tokens;
int32_t token_count; // Track tokens for streaming updates
+
+ std::atomic<bool>* cancel_flag;
};
/**
@@ -517,6 +523,10 @@ struct llm_stream_context {
static rac_bool_t llm_stream_token_callback(const char* token, void* user_data) {
auto* ctx = reinterpret_cast<llm_stream_context*>(user_data);
+ if (ctx->cancel_flag && ctx->cancel_flag->load(std::memory_order_relaxed)) {
+ return RAC_FALSE;
+ }
+
// Track first token time and emit first token event
if (!ctx->first_token_recorded) {
ctx->first_token_recorded = true;
@@ -576,6 +586,8 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
auto* component = reinterpret_cast<rac_llm_component*>(handle);
std::lock_guard lock(component->mtx);
+ component->cancel_requested.store(false, std::memory_order_relaxed);
+
// Generate unique ID for this generation
std::string generation_id = generate_unique_id();
const char* model_id = rac_lifecycle_get_model_id(component->lifecycle);
@@ -667,6 +679,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
ctx.temperature = effective_options->temperature;
ctx.max_tokens = effective_options->max_tokens;
ctx.token_count = 0;
+ ctx.cancel_flag = &component->cancel_requested;
// Perform streaming generation
result = rac_llm_generate_stream(service, prompt, effective_options, llm_stream_token_callback,
@@ -702,7 +715,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream(
rac_llm_result_t final_result = {};
final_result.text = strdup(ctx.full_text.c_str());
final_result.prompt_tokens = ctx.prompt_tokens;
- final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
+ final_result.completion_tokens = ctx.token_count > 0 ? ctx.token_count : estimate_tokens(ctx.full_text.c_str());
final_result.total_tokens = final_result.prompt_tokens + final_result.completion_tokens;
final_result.total_time_ms = total_time_ms;
@@ -761,7 +774,8 @@ extern "C" rac_result_t rac_llm_component_cancel(rac_handle_t handle) {
return RAC_ERROR_INVALID_HANDLE;
auto* component = reinterpret_cast<rac_llm_component*>(handle);
- std::lock_guard lock(component->mtx);
+
+ component->cancel_requested.store(true, std::memory_order_relaxed);
rac_handle_t service = rac_lifecycle_get_service(component->lifecycle);
if (service) {
diff --git a/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp b/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
index 4212d4abb..0ca01a419 100644
--- a/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/tool_calling.cpp
@@ -398,13 +398,41 @@ static bool extract_json_value(const char* json_obj, const char* key, char** out
*out_is_object = true;
return true;
}
+ } else {
+ // Scalar value (number, boolean, null)
+ // Read until comma, closing brace, or whitespace
+ size_t val_start = pos;
+ size_t val_end = pos;
+ while (val_end < len && json_obj[val_end] != ',' &&
+ json_obj[val_end] != '}' && json_obj[val_end] != ']' &&
+ json_obj[val_end] != '\n') {
+ val_end++;
+ }
+ // Trim trailing whitespace
+ while (val_end > val_start &&
+ (json_obj[val_end - 1] == ' ' || json_obj[val_end - 1] == '\t')) {
+ val_end--;
+ }
+ if (val_end > val_start) {
+ size_t val_len = val_end - val_start;
+ *out_value = static_cast<char*>(malloc(val_len + 1));
+ if (*out_value) {
+ memcpy(*out_value, json_obj + val_start, val_len);
+ (*out_value)[val_len] = '\0';
+ }
+ *out_is_object = false;
+ return true;
+ }
}
}
}
}
// Move to end of key for continued scanning
+ // Skip the in_string toggle - extract_json_string already
+ // consumed the closing quote so in_string must stay false.
i = key_end - 1;
+ continue;
}
}
in_string = !in_string;
@@ -663,10 +691,46 @@ static bool extract_tool_name_and_args(const char* json_obj, char** out_tool_nam
}
}
- // No arguments found - use empty object
- *out_args_json = static_cast<char*>(malloc(3));
- if (*out_args_json) {
- std::memcpy(*out_args_json, "{}", 3);
+ // No standard argument wrapper key found.
+ // Fallback: collect all remaining keys (excluding the tool name key)
+ // as flat arguments. This handles LLM output like:
+ // {"tool": "calculate", "expression": "5 * 100"}
+ {
+ std::vector<std::string> all_keys = get_json_keys(json_obj);
+ std::string flat_args = "{";
+ bool first = true;
+ for (const auto& k : all_keys) {
+ // Skip the key that matched the tool name
+ bool is_tool_key = false;
+ for (int t = 0; TOOL_NAME_KEYS[t] != nullptr; t++) {
+ if (str_equals_ignore_case(k.c_str(), TOOL_NAME_KEYS[t])) {
+ is_tool_key = true;
+ break;
+ }
+ }
+ if (is_tool_key) continue;
+
+ char* kval = nullptr;
+ bool kval_is_obj = false;
+ if (extract_json_value(json_obj, k.c_str(), &kval, &kval_is_obj)) {
+ if (!first) flat_args += ",";
+ std::string escaped_key = escape_json_string(k.c_str());
+ if (kval_is_obj) {
+ flat_args += "\"" + escaped_key + "\":" + std::string(kval);
+ } else if (kval) {
+ std::string escaped_val = escape_json_string(kval);
+ flat_args += "\"" + escaped_key + "\":\"" + escaped_val + "\"";
+ }
+ free(kval);
+ first = false;
+ }
+ }
+ flat_args += "}";
+
+ *out_args_json = static_cast<char*>(malloc(flat_args.size() + 1));
+ if (*out_args_json) {
+ std::memcpy(*out_args_json, flat_args.c_str(), flat_args.size() + 1);
+ }
}
return true;
}
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
index 09baad44d..b6b3413a8 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Features/TTS/Services/AudioPlaybackManager.swift
@@ -87,7 +87,7 @@ public class AudioPlaybackManager: NSObject, ObservableObject, AVAudioPlayerDele
/// Stop current playback
public func stop() {
- guard isPlaying else { return }
+ guard audioPlayer != nil else { return }
audioPlayer?.stop()
cleanupPlayback(success: false)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
index 0542583be..823c513b8 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Foundation/Bridge/CppBridge.swift
@@ -179,6 +179,15 @@ public enum CppBridge {
guard wasInitialized else { return }
+ Task {
+ await LLM.shared.destroy()
+ await STT.shared.destroy()
+ await TTS.shared.destroy()
+ await VAD.shared.destroy()
+ await VoiceAgent.shared.destroy()
+ await VLM.shared.destroy()
+ }
+
// Shutdown in reverse order
// Note: ModelAssignment and Platform callbacks remain valid (static)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
index 3123541de..ec988e980 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Infrastructure/Download/Utilities/ArchiveUtility.swift
@@ -190,7 +190,9 @@ public final class ArchiveUtility {
/// Decompress raw deflate data using streaming compression_stream_process.
/// Uses a small 256 KB output buffer instead of pre-allocating compressedSize * N.
private static func decompressDeflateStreaming(_ data: Data, range: Range<Int>) throws -> Data {
- var stream = compression_stream()
+ let placeholder = UnsafeMutablePointer<UInt8>.allocate(capacity: 1)
+ defer { placeholder.deallocate() }
+ var stream = compression_stream(dst_ptr: placeholder, dst_size: 0, src_ptr: placeholder, src_size: 0, state: nil)
guard compression_stream_init(&stream, COMPRESSION_STREAM_DECODE, COMPRESSION_ZLIB) == COMPRESSION_STATUS_OK else {
throw SDKError.download(.extractionFailed, "Failed to initialize decompression stream")
}
@@ -218,7 +220,7 @@ public final class ArchiveUtility {
stream.dst_ptr = outputBuffer
stream.dst_size = outputChunkSize
- status = compression_stream_process(&stream, COMPRESSION_STREAM_FINALIZE)
+ status = compression_stream_process(&stream, Int32(COMPRESSION_STREAM_FINALIZE.rawValue))
let bytesProduced = outputChunkSize - stream.dst_size
if bytesProduced > 0 {
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
index bc721a59c..ba4091e80 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+TextGeneration.swift
@@ -85,19 +85,21 @@ public extension RunAnywhere {
let totalTimeMs = endTime.timeIntervalSince(startTime) * 1000
// Extract result
- let generatedText: String
+ let rawText: String
if let textPtr = llmResult.text {
- generatedText = String(cString: textPtr)
+ rawText = String(cString: textPtr)
} else {
- generatedText = ""
+ rawText = ""
}
let inputTokens = Int(llmResult.prompt_tokens)
let outputTokens = Int(llmResult.completion_tokens)
let tokensPerSecond = llmResult.tokens_per_second > 0 ? Double(llmResult.tokens_per_second) : 0
+ let (generatedText, thinkingContent) = ThinkingContentParser.extract(from: rawText)
+
return LLMGenerationResult(
text: generatedText,
- thinkingContent: nil,
+ thinkingContent: thinkingContent,
inputTokens: inputTokens,
tokensUsed: outputTokens,
modelUsed: modelId,
@@ -105,7 +107,7 @@ public extension RunAnywhere {
framework: "llamacpp",
tokensPerSecond: tokensPerSecond,
timeToFirstTokenMs: nil,
- thinkingTokens: 0,
+ thinkingTokens: thinkingContent.map { _ in outputTokens } ?? 0,
responseTokens: outputTokens
)
}
@@ -189,47 +191,43 @@ public extension RunAnywhere {
) -> AsyncThrowingStream<String, Error> {
AsyncThrowingStream { continuation in
Task {
- do {
- await collector.markStart()
-
- let context = LLMStreamCallbackContext(continuation: continuation, collector: collector)
- let contextPtr = Unmanaged.passRetained(context).toOpaque()
-
- let callbacks = LLMStreamCallbacks.create()
- var cOptions = options
-
- let callCFunction: () -> rac_result_t = {
- prompt.withCString { promptPtr in
- rac_llm_component_generate_stream(
- handle,
- promptPtr,
- &cOptions,
- callbacks.token,
- callbacks.complete,
- callbacks.error,
- contextPtr
- )
- }
+ await collector.markStart()
+
+ let context = LLMStreamCallbackContext(continuation: continuation, collector: collector)
+ // passRetained: context is released in completeCallback or errorCallback
+ let contextPtr = Unmanaged.passRetained(context).toOpaque()
+
+ let callbacks = LLMStreamCallbacks.create()
+ var cOptions = options
+
+ let callCFunction: () -> rac_result_t = {
+ prompt.withCString { promptPtr in
+ rac_llm_component_generate_stream(
+ handle,
+ promptPtr,
+ &cOptions,
+ callbacks.token,
+ callbacks.complete,
+ callbacks.error,
+ contextPtr
+ )
}
+ }
- let streamResult: rac_result_t
- if let systemPrompt = systemPrompt {
- streamResult = systemPrompt.withCString { sysPtr in
- cOptions.system_prompt = sysPtr
- return callCFunction()
- }
- } else {
- cOptions.system_prompt = nil
- streamResult = callCFunction()
+ let streamResult: rac_result_t
+ if let systemPrompt = systemPrompt {
+ streamResult = systemPrompt.withCString { sysPtr in
+ cOptions.system_prompt = sysPtr
+ return callCFunction()
}
+ } else {
+ cOptions.system_prompt = nil
+ streamResult = callCFunction()
+ }
- if streamResult != RAC_SUCCESS {
- Unmanaged<LLMStreamCallbackContext>.fromOpaque(contextPtr).release()
- let error = SDKError.llm(.generationFailed, "Stream generation failed: \(streamResult)")
- continuation.finish(throwing: error)
- await collector.markFailed(error)
- }
- } catch {
+ if streamResult != RAC_SUCCESS {
+ Unmanaged<LLMStreamCallbackContext>.fromOpaque(contextPtr).release()
+ let error = SDKError.llm(.generationFailed, "Stream generation failed: \(streamResult)")
continuation.finish(throwing: error)
await collector.markFailed(error)
}
@@ -255,6 +253,7 @@ private enum LLMStreamCallbacks {
static func create() -> Callbacks {
let tokenCallback: TokenFn = { tokenPtr, userData -> rac_bool_t in
guard let tokenPtr = tokenPtr, let userData = userData else { return RAC_TRUE }
+ if Task.isCancelled { return RAC_FALSE }
let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
let token = String(cString: tokenPtr)
Task {
@@ -264,16 +263,28 @@ private enum LLMStreamCallbacks {
return RAC_TRUE
}
- let completeCallback: CompleteFn = { _, userData in
+ let completeCallback: CompleteFn = { resultPtr, userData in
guard let userData = userData else { return }
- let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
+ let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeRetainedValue()
ctx.continuation.finish()
- Task { await ctx.collector.markComplete() }
+
+ if let result = resultPtr?.pointee {
+ Task {
+ await ctx.collector.markCompleteWithMetrics(
+ promptTokens: Int(result.prompt_tokens),
+ completionTokens: Int(result.completion_tokens),
+ tokensPerSecond: Double(result.tokens_per_second),
+ timeToFirstTokenMs: Double(result.time_to_first_token_ms)
+ )
+ }
+ } else {
+ Task { await ctx.collector.markComplete() }
+ }
}
let errorCallback: ErrorFn = { _, errorMsg, userData in
guard let userData = userData else { return }
- let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeUnretainedValue()
+ let ctx = Unmanaged<LLMStreamCallbackContext>.fromOpaque(userData).takeRetainedValue()
let message = errorMsg.map { String(cString: $0) } ?? "Unknown error"
let error = SDKError.llm(.generationFailed, message)
ctx.continuation.finish(throwing: error)
@@ -296,6 +307,34 @@ private final class LLMStreamCallbackContext: @unchecked Sendable {
}
}
+// MARK: - Thinking Content Parser
+
+enum ThinkingContentParser {
+ /// Extracts `<think>...</think>` content from generated text.
+ /// - Returns: Tuple of (responseText, thinkingContent). If no tags found, responseText = original text, thinkingContent = nil.
+ static func extract(from text: String) -> (text: String, thinking: String?) {
+ guard let startRange = text.range(of: "<think>"),
+ let endRange = text.range(of: "</think>"),
+ startRange.upperBound <= endRange.lowerBound else {
+ return (text: text, thinking: nil)
+ }
+ let thinkingContent = String(text[startRange.upperBound..<endRange.lowerBound])
+ .trimmingCharacters(in: .whitespacesAndNewlines)
+ // Response text is everything before <think> and after </think>
+ let textBefore = String(text[..<startRange.lowerBound])
+ let textAfter = String(text[endRange.upperBound...])
+ let responseText = (textBefore + textAfter).trimmingCharacters(in: .whitespacesAndNewlines)
+ return (text: responseText, thinking: thinkingContent.isEmpty ? nil : thinkingContent)
+ }
+}
+
+ private var cppPromptTokens: Int?
+ private var cppCompletionTokens: Int?
+ private var cppTokensPerSecond: Double?
+ private var cppTimeToFirstTokenMs: Double?
+
init(modelId: String, promptLength: Int) {
self.modelId = modelId
self.promptLength = promptLength
@@ -339,6 +383,24 @@ private actor LLMStreamingMetricsCollector {
}
}
+ func markCompleteWithMetrics(
+ promptTokens: Int,
+ completionTokens: Int,
+ tokensPerSecond: Double,
+ timeToFirstTokenMs: Double
+ ) {
+ if promptTokens > 0 { cppPromptTokens = promptTokens }
+ if completionTokens > 0 { cppCompletionTokens = completionTokens }
+ if tokensPerSecond > 0 { cppTokensPerSecond = tokensPerSecond }
+ if timeToFirstTokenMs > 0 { cppTimeToFirstTokenMs = timeToFirstTokenMs }
+
+ isComplete = true
+ if let continuation = resultContinuation {
+ continuation.resume(returning: buildResult())
+ resultContinuation = nil
+ }
+ }
+
func markFailed(_ error: Error) {
self.error = error
if let continuation = resultContinuation {
@@ -363,20 +425,32 @@ private actor LLMStreamingMetricsCollector {
let endTime = Date()
let latencyMs = (startTime.map { endTime.timeIntervalSince($0) } ?? 0) * 1000
- var timeToFirstTokenMs: Double?
- if let start = startTime, let firstToken = firstTokenTime {
+ let timeToFirstTokenMs: Double?
+ if let cppTtft = cppTimeToFirstTokenMs {
+ timeToFirstTokenMs = cppTtft
+ } else if let start = startTime, let firstToken = firstTokenTime {
timeToFirstTokenMs = firstToken.timeIntervalSince(start) * 1000
+ } else {
+ timeToFirstTokenMs = nil
}
- // Use actual token count from streaming callbacks, not character estimation (fixes #339)
- let outputTokens = max(1, tokenCount)
- let totalTimeSec = latencyMs / 1000.0
- let tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
+ let outputTokens = cppCompletionTokens ?? max(1, tokenCount)
+ let inputTokens = cppPromptTokens ?? 0
+
+ let tokensPerSecond: Double
+ if let cppTps = cppTokensPerSecond {
+ tokensPerSecond = cppTps
+ } else {
+ let totalTimeSec = latencyMs / 1000.0
+ tokensPerSecond = totalTimeSec > 0 ? Double(outputTokens) / totalTimeSec : 0
+ }
+
+ let (responseText, thinkingContent) = ThinkingContentParser.extract(from: fullText)
return LLMGenerationResult(
- text: fullText,
- thinkingContent: nil,
- inputTokens: 0,
+ text: responseText,
+ thinkingContent: thinkingContent,
+ inputTokens: inputTokens,
tokensUsed: outputTokens,
modelUsed: modelId,
latencyMs: latencyMs,
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
index 2a8481f7c..eef87c382 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/LLM/RunAnywhere+ToolCalling.swift
@@ -168,8 +168,17 @@ public extension RunAnywhere {
let registeredTools = await ToolRegistry.shared.getAll()
let tools = opts.tools ?? registeredTools
+ // Extract /no_think prefix before building the full prompt so it stays
+ // at the beginning where the C++ inference layer expects it.
+ let noThinkPrefix = "/no_think\n"
+ let hasNoThink = prompt.hasPrefix(noThinkPrefix)
+ let cleanPrompt = hasNoThink ? String(prompt.dropFirst(noThinkPrefix.count)) : prompt
+
let systemPrompt = buildToolSystemPrompt(tools: tools, options: opts)
- var fullPrompt = systemPrompt.isEmpty ? prompt : "\(systemPrompt)\n\nUser: \(prompt)"
+ var fullPrompt = systemPrompt.isEmpty ? cleanPrompt : "\(systemPrompt)\n\nUser: \(cleanPrompt)"
+ if hasNoThink {
+ fullPrompt = "\(noThinkPrefix)\(fullPrompt)"
+ }
var allToolCalls: [ToolCall] = []
var allToolResults: [ToolResult] = []
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
index cda94d5c6..7d3dec55c 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/TTS/RunAnywhere+TTS.swift
@@ -95,6 +95,7 @@ public extension RunAnywhere {
// Synthesize (C++ emits events)
var ttsResult = rac_tts_result_t()
+ defer { rac_tts_result_free(&ttsResult) }
let synthesizeResult = text.withCString { textPtr in
rac_tts_component_synthesize(handle, textPtr, &cOptions, &ttsResult)
}
@@ -157,7 +158,6 @@ public extension RunAnywhere {
let voiceId = await CppBridge.TTS.shared.currentVoiceId ?? "unknown"
let startTime = Date()
- var totalAudioData = Data()
// Build C options
var cOptions = rac_tts_options_t()
@@ -166,8 +166,8 @@ public extension RunAnywhere {
cOptions.volume = options.volume
cOptions.sample_rate = Int32(options.sampleRate)
- // Create callback context
- let context = TTSStreamContext(onChunk: onAudioChunk, totalData: &totalAudioData)
+ // Create callback context - owns its own Data
+ let context = TTSStreamContext(onChunk: onAudioChunk)
let contextPtr = Unmanaged.passRetained(context).toOpaque()
let streamResult = text.withCString { textPtr in
@@ -180,13 +180,14 @@ public extension RunAnywhere {
let ctx = Unmanaged<TTSStreamContext>.fromOpaque(userData).takeUnretainedValue()
let chunk = Data(bytes: audioPtr, count: audioSize)
ctx.onChunk(chunk)
- ctx.totalData.pointee.append(chunk)
+ ctx.totalData.append(chunk)
},
contextPtr
)
}
- Unmanaged<TTSStreamContext>.fromOpaque(contextPtr).release()
+ let finalContext = Unmanaged<TTSStreamContext>.fromOpaque(contextPtr).takeRetainedValue()
+ let totalAudioData = finalContext.totalData
guard streamResult == RAC_SUCCESS else {
throw SDKError.tts(.processingFailed, "Streaming synthesis failed: \(streamResult)")
@@ -309,10 +310,9 @@ public extension RunAnywhere {
private final class TTSStreamContext: @unchecked Sendable {
let onChunk: (Data) -> Void
- var totalData: UnsafeMutablePointer<Data>
+ var totalData: Data = Data()
- init(onChunk: @escaping (Data) -> Void, totalData: UnsafeMutablePointer<Data>) {
+ init(onChunk: @escaping (Data) -> Void) {
self.onChunk = onChunk
- self.totalData = totalData
}
}
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
index f96978079..d61fb8cbb 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceAgent.swift
@@ -257,10 +257,14 @@ public extension RunAnywhere {
rac_voice_agent_synthesize_speech(handle, textPtr, &audioPtr, &audioSize)
}
- guard result == RAC_SUCCESS, let ptr = audioPtr, audioSize > 0 else {
+ guard result == RAC_SUCCESS else {
throw SDKError.voiceAgent(.processingFailed, "Speech synthesis failed: \(result)")
}
+ guard let ptr = audioPtr, audioSize > 0 else {
+ return Data()
+ }
+
let audioData = Data(bytes: ptr, count: audioSize)
free(ptr)
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
index 091842093..8e143cca9 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/RunAnywhere+VoiceSession.swift
@@ -108,6 +108,10 @@ public actor VoiceSessionHandle {
eventContinuation?.finish()
}
+ public func interruptPlayback() {
+ audioPlayback.stop()
+ }
+
/// Force process current audio (push-to-talk)
public func sendNow() async {
guard isRunning else { return }
@@ -115,6 +119,12 @@ public actor VoiceSessionHandle {
await processCurrentAudio()
}
+ /// Resume listening after a completed turn (for push-to-talk when continuousMode is false)
+ public func resumeListening() async {
+ guard isRunning else { return }
+ try? await startListening()
+ }
+
// MARK: - Private
private func emit(_ event: VoiceSessionEvent) {
@@ -196,44 +206,68 @@ public actor VoiceSessionHandle {
emit(.processing)
+ var transcription = ""
+ var cleanedResponse = ""
+ var thinkingContent: String?
+ var synthesizedAudio: Data?
+
do {
- let result = try await RunAnywhere.processVoiceTurn(audio)
+ // Step 1: Transcribe audio
+ transcription = try await RunAnywhere.voiceAgentTranscribe(audio)
- guard result.speechDetected else {
- logger.info("No speech detected")
+ guard !transcription.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+ logger.info("No speech detected (empty transcription)")
+ emit(.turnCompleted(transcript: "", response: "", thinkingContent: nil, audio: nil))
if config.continuousMode && isRunning {
try? await startListening()
}
return
}
- // Emit intermediate results
- if let transcript = result.transcription {
- emit(.transcribed(text: transcript))
- }
+ emit(.transcribed(text: transcription))
- if let response = result.response {
- emit(.responded(text: response))
+ // Step 2: Generate LLM response (apply /no_think prefix if needed)
+ let effectivePrompt: String
+ if !config.thinkingModeEnabled {
+ effectivePrompt = "/no_think\n\(transcription)"
+ } else {
+ effectivePrompt = transcription
}
- // Play TTS if enabled
- if config.autoPlayTTS, let ttsAudio = result.synthesizedAudio, !ttsAudio.isEmpty {
- emit(.speaking)
- try await audioPlayback.play(ttsAudio)
+ let options = LLMGenerationOptions(maxTokens: config.maxTokens ?? 100)
+ let result = try await RunAnywhere.generate(effectivePrompt, options: options)
+ // generate() already runs ThinkingContentParser internally
+ cleanedResponse = result.text
+ thinkingContent = result.thinkingContent
+
+ emit(.responded(text: cleanedResponse, thinkingContent: thinkingContent))
+
+ // Step 4: Synthesize speech from cleaned response (no think tags spoken)
+ if config.autoPlayTTS, !cleanedResponse.isEmpty {
+ let ttsAudio = try await RunAnywhere.voiceAgentSynthesizeSpeech(cleanedResponse)
+ synthesizedAudio = ttsAudio
+
+ if !ttsAudio.isEmpty {
+ emit(.speaking)
+ do {
+ try await audioPlayback.play(ttsAudio)
+ } catch is AudioPlaybackError {
+ logger.info("TTS playback interrupted by user")
+ }
+ }
}
-
- // Emit complete result
- emit(.turnCompleted(
- transcript: result.transcription ?? "",
- response: result.response ?? "",
- audio: result.synthesizedAudio
- ))
-
} catch {
logger.error("Processing failed: \(error)")
emit(.error(error.localizedDescription))
}
+ emit(.turnCompleted(
+ transcript: transcription,
+ response: cleanedResponse,
+ thinkingContent: thinkingContent,
+ audio: synthesizedAudio
+ ))
+
// Resume listening if continuous mode
if config.continuousMode && isRunning {
try? await startListening()
diff --git a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
index da6c8ff1e..47f30a0d4 100644
--- a/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
+++ b/sdk/runanywhere-swift/Sources/RunAnywhere/Public/Extensions/VoiceAgent/VoiceAgentTypes.swift
@@ -23,6 +23,9 @@ public struct VoiceAgentResult: Sendable {
/// Generated response text from LLM
public var response: String?
+ /// Thinking content extracted from `<think>...</think>` tags (nil if none)
+ public var thinkingContent: String?
+
/// Synthesized audio data from TTS
public var synthesizedAudio: Data?
@@ -31,11 +34,13 @@ public struct VoiceAgentResult: Sendable {
speechDetected: Bool = false,
transcription: String? = nil,
response: String? = nil,
+ thinkingContent: String? = nil,
synthesizedAudio: Data? = nil
) {
self.speechDetected = speechDetected
self.transcription = transcription
self.response = response
+ self.thinkingContent = thinkingContent
self.synthesizedAudio = synthesizedAudio
}
@@ -185,14 +190,14 @@ public enum VoiceSessionEvent: Sendable {
/// Got transcription from STT
case transcribed(text: String)
- /// Got response from LLM
- case responded(text: String)
+ /// Got response from LLM (with optional thinking content)
+ case responded(text: String, thinkingContent: String?)
/// Playing TTS audio
case speaking
- /// Complete turn result
- case turnCompleted(transcript: String, response: String, audio: Data?)
+ /// Complete turn result (with optional thinking content)
+ case turnCompleted(transcript: String, response: String, thinkingContent: String?, audio: Data?)
/// Session stopped
case stopped
@@ -217,16 +222,26 @@ public struct VoiceSessionConfig: Sendable {
/// Whether to auto-resume listening after TTS playback
public var continuousMode: Bool
+ /// Whether thinking mode is enabled for the LLM.
+ public var thinkingModeEnabled: Bool
+
+ /// Maximum tokens for LLM generation (nil uses SDK default of 100)
+ public var maxTokens: Int?
+
public init(
silenceDuration: TimeInterval = 1.5,
speechThreshold: Float = 0.1,
autoPlayTTS: Bool = true,
- continuousMode: Bool = true
+ continuousMode: Bool = true,
+ thinkingModeEnabled: Bool = false,
+ maxTokens: Int? = nil
) {
self.silenceDuration = silenceDuration
self.speechThreshold = speechThreshold
self.autoPlayTTS = autoPlayTTS
self.continuousMode = continuousMode
+ self.thinkingModeEnabled = thinkingModeEnabled
+ self.maxTokens = maxTokens
}
/// Default configuration