Skip to content

Commit c03e505

Browse files
committed "wip"
1 parent a14b9f1 commit c03e505

File tree

3 files changed

+146
-49
lines changed

3 files changed

+146
-49
lines changed

Sources/Core/Models/Model.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,12 @@ public enum Model: RawRepresentable, Equatable, Hashable, Codable, Sendable {
1616
}
1717
}
1818
}
19+
20+
public extension Model {
21+
enum Transcription: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
22+
case whisper = "whisper-1"
23+
case gpt4o = "gpt-4o-transcribe-latest"
24+
case gpt4oMini = "gpt-4o-mini-transcribe"
25+
case gpt4oDiarize = "gpt-4o-transcribe-diarize"
26+
}
27+
}

Sources/Core/Models/Session.swift

Lines changed: 137 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ import HelperCoders
5151
}
5252

5353
public enum Voice: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
54-
case alloy, ash, ballad, coral, echo, sage, shimmer, verse
54+
case alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, cedar
5555
}
5656

5757
/// The format of input audio.
@@ -60,108 +60,196 @@ import HelperCoders
6060
public var type: String
6161
}
6262

63+
/// Configuration for input and output audio.
6364
public struct Audio: Codable, Equatable, Hashable, Sendable {
65+
/// Configuration for input audio.
6466
public struct Input: Equatable, Hashable, Codable, Sendable {
65-
public struct Transcription: Codable, Equatable, Hashable, Sendable {
66-
public enum TranscriptionModel: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
67-
case whisper = "whisper-1"
68-
case gpt4o = "gpt-4o-transcribe"
69-
case gpt4oMini = "gpt-4o-mini-transcribe"
70-
}
71-
67+
/// Configuration for input audio transcription.
68+
public struct Transcription: Equatable, Hashable, Codable, Sendable {
7269
/// The model to use for transcription
73-
public var model: TranscriptionModel
70+
public var model: Model.Transcription
7471

7572
/// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. `en`) format will improve accuracy and latency.
7673
public var language: String?
7774

7875
/// An optional text to guide the model's style or continue a previous audio segment.
7976
///
80-
/// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology".
77+
/// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
78+
/// For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology".
8179
public var prompt: String?
8280

83-
public init(model: TranscriptionModel = .whisper) {
81+
public init(model: Model.Transcription = .gpt4o, language: String? = nil, prompt: String? = nil) {
8482
self.model = model
83+
self.prompt = prompt
84+
self.language = language
8585
}
8686
}
8787

88-
public struct NoiseReduction: Codable, Equatable, Hashable, Sendable {
89-
/// Type of noise reduction.
90-
public enum NoiseReductionType: String, CaseIterable, Hashable, Codable, Sendable {
91-
/// For close-talking microphones such as headphones
92-
case nearField = "near_field"
93-
94-
/// For far-field microphones such as laptop or conference room microphones
95-
case farField = "far_field"
96-
}
88+
/// Configuration for input audio noise reduction.
89+
@Codable @CodedAt("type") public enum NoiseReduction: CaseIterable, Equatable, Hashable, Sendable {
90+
/// For close-talking microphones such as headphones
91+
@CodedAs("near_field")
92+
case nearField
9793

98-
/// Type of noise reduction.
99-
public var type: NoiseReductionType?
100-
101-
public init(type: NoiseReductionType? = nil) {
102-
self.type = type
103-
}
94+
/// For far-field microphones such as laptop or conference room microphones
95+
@CodedAs("far_field")
96+
case farField
10497
}
10598

99+
/// Configuration for turn detection
106100
public struct TurnDetection: Codable, Equatable, Hashable, Sendable {
101+
/// The type of turn detection.
107102
public enum TurnDetectionType: String, Codable, Equatable, Hashable, Sendable {
108-
case none
109103
case serverVad = "server_vad"
110104
case semanticVad = "semantic_vad"
111105
}
112106

107+
/// The eagerness of the model to respond.
113108
public enum TurnDetectionEagerness: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
114109
case auto, low, medium, high
115110
}
116111

117-
/// The type of turn detection.
118-
public var type: TurnDetectionType
119-
/// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0).
120-
public var threshold: Double?
112+
/// Whether or not to automatically generate a response when a VAD stop event occurs.
113+
public var createResponse: Bool
114+
115+
/// Used only for `semanticVad` mode. The eagerness of the model to respond.
116+
///
117+
/// `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
118+
public var eagerness: TurnDetectionEagerness?
119+
120+
/// Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
121+
public var idleTimeout: Int?
122+
121123
/// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
122124
public var interruptResponse: Bool?
123-
/// Used only for `server_vad` mode. Amount of audio to include before speech starts (in milliseconds).
125+
126+
/// Used only for `serverVad` mode. Amount of audio to include before speech starts (in milliseconds).
127+
///
128+
/// Defaults to `300ms`.
124129
public var prefixPaddingMs: Int?
125-
/// Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds).
130+
131+
/// Used only for `serverVad` mode. Duration of silence to detect speech stop (in milliseconds).
132+
///
133+
/// Defaults to `500ms`.
134+
///
135+
/// With shorter values the model will respond more quickly, but may jump in on short pauses from the user.
126136
public var silenceDurationMs: Int?
127-
/// Whether or not to automatically generate a response when VAD is enabled.
128-
public var createResponse: Bool
129-
/// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
130-
public var eagerness: TurnDetectionEagerness?
131137

132-
public init(type: TurnDetectionType = .serverVad, threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil) {
133-
self.type = type
134-
self.eagerness = eagerness
135-
self.threshold = threshold
138+
/// Used only for `serverVad` mode. Activation threshold for VAD (0.0 to 1.0).
139+
///
140+
/// A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments.
141+
public var threshold: Double?
142+
143+
/// The type of turn detection.
144+
public var type: TurnDetectionType
145+
146+
/// Creates a new `TurnDetection` configuration.
147+
///
148+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
149+
/// - Parameter eagerness: Only for `semanticVad` mode. The eagerness of the model to respond.
150+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
151+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
152+
/// - Parameter prefixPaddingMs: Only for `serverVad` mode. Amount of audio to include before speech starts (in milliseconds).
153+
/// - Parameter silenceDurationMs: Only for `serverVad` mode. Duration of silence to detect speech stop (in milliseconds).
154+
/// - Parameter threshold: Only for `serverVad` mode. Activation threshold for VAD (0.0 to 1.0).
155+
/// - Parameter type: The type of turn detection.
156+
public init(createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil, idleTimeout: Int? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, threshold: Double? = nil, type: TurnDetectionType = .serverVad) {
136157
self.createResponse = createResponse
158+
self.eagerness = eagerness
159+
self.idleTimeout = idleTimeout
160+
self.interruptResponse = interruptResponse
137161
self.prefixPaddingMs = prefixPaddingMs
138162
self.silenceDurationMs = silenceDurationMs
139-
self.interruptResponse = interruptResponse
163+
self.threshold = threshold
164+
self.type = type
140165
}
141166

142-
public static func serverVad(threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil) -> TurnDetection {
143-
.init(type: .serverVad, threshold: threshold, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs)
167+
/// Creates a new `TurnDetection` configuration for Server VAD.
168+
///
169+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
170+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
171+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
172+
/// - Parameter prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
173+
/// - Parameter silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).
174+
/// - Parameter threshold: Activation threshold for VAD (0.0 to 1.0).
175+
public static func serverVad(createResponse: Bool = true, idleTimeout: Int? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, threshold: Double? = nil) -> TurnDetection {
176+
.init(createResponse: createResponse, eagerness: nil, idleTimeout: idleTimeout, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs, threshold: threshold, type: .serverVad)
144177
}
145178

146-
public static func semanticVad(eagerness: TurnDetectionEagerness = .auto) -> TurnDetection {
147-
.init(type: .semanticVad, eagerness: eagerness)
179+
/// Creates a new `TurnDetection` configuration for Semantic VAD.
180+
///
181+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
182+
/// - Parameter eagerness: The eagerness of the model to respond.
183+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
184+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
185+
public static func semanticVad(createResponse: Bool = true, eagerness: TurnDetectionEagerness? = .auto, idleTimeout: Int? = nil, interruptResponse: Bool? = nil) -> TurnDetection {
186+
.init(createResponse: createResponse, eagerness: eagerness, idleTimeout: idleTimeout, interruptResponse: interruptResponse, prefixPaddingMs: nil, silenceDurationMs: nil, threshold: nil, type: .semanticVad)
148187
}
149188
}
150189

190+
/// The format of input audio.
151191
public var format: AudioFormat
192+
193+
/// Configuration for input audio noise reduction.
194+
///
195+
/// Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model.
196+
///
197+
/// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio.
198+
public var noiseReduction: NoiseReduction?
199+
200+
/// Configuration for input audio transcription.
201+
///
202+
/// Input audio transcription is not native to the model, since the model consumes audio directly.
203+
///
204+
/// Transcription runs asynchronously through [the `/audio/transcriptions` endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) and should be treated as guidance of input audio content rather than precisely what the model heard.
205+
///
206+
/// The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service.
152207
public var transcription: Transcription?
208+
209+
/// Configuration for turn detection, either Server VAD or Semantic VAD.
210+
///
211+
/// Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.
212+
///
213+
/// Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability.
214+
///
215+
/// For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking.
216+
///
217+
/// This can be useful for more natural conversations, but may have a higher latency.
153218
public var turnDetection: TurnDetection?
154-
public var noiseReduction: NoiseReduction?
155219
}
156220

221+
/// Configuration for output audio.
157222
public struct Output: Equatable, Hashable, Codable, Sendable {
223+
/// The voice the model uses to respond.
224+
///
225+
/// Voice cannot be changed during the session once the model has responded with audio at least once.
158226
public var voice: Voice
227+
228+
/// The speed of the model's spoken response.
229+
///
230+
/// `1.0` is the default speed. `0.25` is the minimum speed. `1.5` is the maximum speed.
231+
///
232+
/// This value can only be changed in between model turns, not while a response is in progress.
159233
public var speed: Double
234+
235+
/// The format of output audio.
160236
public var format: AudioFormat
161237
}
162238

239+
/// Configuration for input audio.
163240
public var input: Input
241+
242+
/// Configuration for output audio.
164243
public var output: Output
244+
245+
/// Creates a new `Audio` configuration.
246+
///
247+
/// - Parameter input: Configuration for input audio.
248+
/// - Parameter output: Configuration for output audio.
249+
public init(input: Input, output: Output) {
250+
self.input = input
251+
self.output = output
252+
}
165253
}
166254

167255
public struct Tool: Codable, Equatable, Hashable, Sendable {
@@ -333,6 +421,7 @@ import HelperCoders
333421
/// Unique identifier for the session
334422
public var id: String?
335423

424+
/// Configuration for input and output audio.
336425
public var audio: Audio
337426

338427
/// The default system instructions (i.e. system message) prepended to model calls.
@@ -366,7 +455,7 @@ import HelperCoders
366455
/// How the model chooses tools.
367456
public var toolChoice: ToolChoice?
368457

369-
/// Tools (functions) available to the model.
458+
/// Tools available to the model.
370459
public var tools: [Tool]?
371460

372461
public init(expiresAt: Date, id: String? = nil, audio: Audio, instructions: String, maxResponseOutputTokens: MaxResponseOutputTokens? = nil, modalities: [Modality]? = nil, model: Model, prompt: Prompt? = nil, temperature: Double? = nil, toolChoice: ToolChoice? = nil, tools: [Tool]? = nil) {

Sources/UI/Conversation.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import Core
22
import WebRTC
33
import Foundation
4-
@preconcurrency import AVFoundation
54

65
public enum ConversationError: Error {
76
case sessionNotFound

0 commit comments

Comments (0)