Skip to content

Commit c03e505

Browse files
committed "wip"
1 parent a14b9f1 commit c03e505

File tree

3 files changed

+146
-49
lines changed

3 files changed

+146
-49
lines changed

Sources/Core/Models/Model.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,12 @@ public enum Model: RawRepresentable, Equatable, Hashable, Codable, Sendable {
1616
}
1717
}
1818
}
19+
20+
public extension Model {
21+
enum Transcription: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
22+
case whisper = "whisper-1"
23+
case gpt4o = "gpt-4o-transcribe-latest"
24+
case gpt4oMini = "gpt-4o-mini-transcribe"
25+
case gpt4oDiarize = "gpt-4o-transcribe-diarize"
26+
}
27+
}

Sources/Core/Models/Session.swift

Lines changed: 137 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ import HelperCoders
5151
}
5252

5353
public enum Voice: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
54-
case alloy, ash, ballad, coral, echo, sage, shimmer, verse
54+
case alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, cedar
5555
}
5656

5757
/// The format of input audio.
@@ -60,108 +60,196 @@ import HelperCoders
6060
public var type: String
6161
}
6262

63+
/// Configuration for input and output audio.
6364
public struct Audio: Codable, Equatable, Hashable, Sendable {
65+
/// Configuration for input audio.
6466
public struct Input: Equatable, Hashable, Codable, Sendable {
65-
public struct Transcription: Codable, Equatable, Hashable, Sendable {
66-
public enum TranscriptionModel: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
67-
case whisper = "whisper-1"
68-
case gpt4o = "gpt-4o-transcribe"
69-
case gpt4oMini = "gpt-4o-mini-transcribe"
70-
}
71-
67+
/// Configuration for input audio transcription.
68+
public struct Transcription: Equatable, Hashable, Codable, Sendable {
7269
/// The model to use for transcription
73-
public var model: TranscriptionModel
70+
public var model: Model.Transcription
7471

7572
/// The language of the input audio. Supplying the input language in ISO-639-1 (e.g. `en`) format will improve accuracy and latency.
7673
public var language: String?
7774

7875
/// An optional text to guide the model's style or continue a previous audio segment.
7976
///
80-
/// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology".
77+
/// For `whisper`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
78+
/// For `gpt4o` models, the prompt is a free text string, for example "expect words related to technology".
8179
public var prompt: String?
8280

83-
public init(model: TranscriptionModel = .whisper) {
81+
public init(model: Model.Transcription = .gpt4o, language: String? = nil, prompt: String? = nil) {
8482
self.model = model
83+
self.prompt = prompt
84+
self.language = language
8585
}
8686
}
8787

88-
public struct NoiseReduction: Codable, Equatable, Hashable, Sendable {
89-
/// Type of noise reduction.
90-
public enum NoiseReductionType: String, CaseIterable, Hashable, Codable, Sendable {
91-
/// For close-talking microphones such as headphones
92-
case nearField = "near_field"
93-
94-
/// For far-field microphones such as laptop or conference room microphones
95-
case farField = "far_field"
96-
}
88+
/// Configuration for input audio noise reduction.
89+
@Codable @CodedAt("type") public enum NoiseReduction: CaseIterable, Equatable, Hashable, Sendable {
90+
/// For close-talking microphones such as headphones
91+
@CodedAs("near_field")
92+
case nearField
9793

98-
/// Type of noise reduction.
99-
public var type: NoiseReductionType?
100-
101-
public init(type: NoiseReductionType? = nil) {
102-
self.type = type
103-
}
94+
/// For far-field microphones such as laptop or conference room microphones
95+
@CodedAs("far_field")
96+
case farField
10497
}
10598

99+
/// Configuration for turn detection
106100
public struct TurnDetection: Codable, Equatable, Hashable, Sendable {
101+
/// The type of turn detection.
107102
public enum TurnDetectionType: String, Codable, Equatable, Hashable, Sendable {
108-
case none
109103
case serverVad = "server_vad"
110104
case semanticVad = "semantic_vad"
111105
}
112106

107+
/// The eagerness of the model to respond.
113108
public enum TurnDetectionEagerness: String, CaseIterable, Equatable, Hashable, Codable, Sendable {
114109
case auto, low, medium, high
115110
}
116111

117-
/// The type of turn detection.
118-
public var type: TurnDetectionType
119-
/// Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0).
120-
public var threshold: Double?
112+
/// Whether or not to automatically generate a response when a VAD stop event occurs.
113+
public var createResponse: Bool
114+
115+
/// Used only for `semanticVad` mode. The eagerness of the model to respond.
116+
///
117+
/// `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
118+
public var eagerness: TurnDetectionEagerness?
119+
120+
/// Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
121+
public var idleTimeout: Int?
122+
121123
/// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs.
122124
public var interruptResponse: Bool?
123-
/// Used only for `server_vad` mode. Amount of audio to include before speech starts (in milliseconds).
125+
126+
/// Used only for `serverVad` mode. Amount of audio to include before speech starts (in milliseconds).
127+
///
128+
/// Defaults to `300ms`.
124129
public var prefixPaddingMs: Int?
125-
/// Used only for `server_vad` mode. Duration of silence to detect speech stop (in milliseconds).
130+
131+
/// Used only for `serverVad` mode. Duration of silence to detect speech stop (in milliseconds).
132+
///
133+
/// Defaults to `500ms`.
134+
///
135+
/// With shorter values the model will respond more quickly, but may jump in on short pauses from the user.
126136
public var silenceDurationMs: Int?
127-
/// Whether or not to automatically generate a response when VAD is enabled.
128-
public var createResponse: Bool
129-
/// Used only for `semantic_vad` mode. The eagerness of the model to respond. `low` will wait longer for the user to continue speaking, `high` will respond more quickly. `auto` is the default and is equivalent to `medium`.
130-
public var eagerness: TurnDetectionEagerness?
131137

132-
public init(type: TurnDetectionType = .serverVad, threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil) {
133-
self.type = type
134-
self.eagerness = eagerness
135-
self.threshold = threshold
138+
/// Used only for `serverVad` mode. Activation threshold for VAD (0.0 to 1.0).
139+
///
140+
/// A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments.
141+
public var threshold: Double?
142+
143+
/// The type of turn detection.
144+
public var type: TurnDetectionType
145+
146+
/// Creates a new `TurnDetection` configuration.
147+
///
148+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
149+
/// - Parameter eagerness: Only for `semanticVad` mode. The eagerness of the model to respond.
150+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
151+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
152+
/// - Parameter prefixPaddingMs: Only for `serverVad` mode. Amount of audio to include before speech starts (in milliseconds).
153+
/// - Parameter silenceDurationMs: Only for `serverVad` mode. Duration of silence to detect speech stop (in milliseconds).
154+
/// - Parameter threshold: Only for `serverVad` mode. Activation threshold for VAD (0.0 to 1.0).
155+
/// - Parameter type: The type of turn detection.
156+
public init(createResponse: Bool = true, eagerness: TurnDetectionEagerness? = nil, idleTimeout: Int? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, threshold: Double? = nil, type: TurnDetectionType = .serverVad) {
136157
self.createResponse = createResponse
158+
self.eagerness = eagerness
159+
self.idleTimeout = idleTimeout
160+
self.interruptResponse = interruptResponse
137161
self.prefixPaddingMs = prefixPaddingMs
138162
self.silenceDurationMs = silenceDurationMs
139-
self.interruptResponse = interruptResponse
163+
self.threshold = threshold
164+
self.type = type
140165
}
141166

142-
public static func serverVad(threshold: Double? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil) -> TurnDetection {
143-
.init(type: .serverVad, threshold: threshold, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs)
167+
/// Creates a new `TurnDetection` configuration for Server VAD.
168+
///
169+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
170+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
171+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
172+
/// - Parameter prefixPaddingMs: Amount of audio to include before speech starts (in milliseconds).
173+
/// - Parameter silenceDurationMs: Duration of silence to detect speech stop (in milliseconds).
174+
/// - Parameter threshold: Activation threshold for VAD (0.0 to 1.0).
175+
public static func serverVad(createResponse: Bool = true, idleTimeout: Int? = nil, interruptResponse: Bool? = nil, prefixPaddingMs: Int? = nil, silenceDurationMs: Int? = nil, threshold: Double? = nil) -> TurnDetection {
176+
.init(createResponse: createResponse, eagerness: nil, idleTimeout: idleTimeout, interruptResponse: interruptResponse, prefixPaddingMs: prefixPaddingMs, silenceDurationMs: silenceDurationMs, threshold: threshold, type: .serverVad)
144177
}
145178

146-
public static func semanticVad(eagerness: TurnDetectionEagerness = .auto) -> TurnDetection {
147-
.init(type: .semanticVad, eagerness: eagerness)
179+
/// Creates a new `TurnDetection` configuration for Semantic VAD.
180+
///
181+
/// - Parameter createResponse: Whether or not to automatically generate a response when a VAD stop event occurs.
182+
/// - Parameter eagerness: The eagerness of the model to respond.
183+
/// - Parameter idleTimeout: Optional idle timeout after which turn detection will auto-timeout when no additional audio is received.
184+
/// - Parameter interruptResponse: Whether or not to automatically interrupt any ongoing response with output to the default conversation when a VAD start event occurs.
185+
public static func semanticVad(createResponse: Bool = true, eagerness: TurnDetectionEagerness? = .auto, idleTimeout: Int? = nil, interruptResponse: Bool? = nil) -> TurnDetection {
186+
.init(createResponse: createResponse, eagerness: eagerness, idleTimeout: idleTimeout, interruptResponse: interruptResponse, prefixPaddingMs: nil, silenceDurationMs: nil, threshold: nil, type: .semanticVad)
148187
}
149188
}
150189

190+
/// The format of input audio.
151191
public var format: AudioFormat
192+
193+
/// Configuration for input audio noise reduction.
194+
///
195+
/// Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model.
196+
///
197+
/// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio.
198+
public var noiseReduction: NoiseReduction?
199+
200+
/// Configuration for input audio transcription.
201+
///
202+
/// Input audio transcription is not native to the model, since the model consumes audio directly.
203+
///
204+
/// Transcription runs asynchronously through [the `/audio/transcriptions` endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription) and should be treated as guidance of input audio content rather than precisely what the model heard.
205+
///
206+
/// The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service.
152207
public var transcription: Transcription?
208+
209+
/// Configuration for turn detection, either Server VAD or Semantic VAD.
210+
///
211+
/// Server VAD means that the model will detect the start and end of speech based on audio volume and respond at the end of user speech.
212+
///
213+
/// Semantic VAD is more advanced and uses a turn detection model (in conjunction with VAD) to semantically estimate whether the user has finished speaking, then dynamically sets a timeout based on this probability.
214+
///
215+
/// For example, if user audio trails off with "uhhm", the model will score a low probability of turn end and wait longer for the user to continue speaking.
216+
///
217+
/// This can be useful for more natural conversations, but may have a higher latency.
153218
public var turnDetection: TurnDetection?
154-
public var noiseReduction: NoiseReduction?
155219
}
156220

221+
/// Configuration for output audio.
157222
public struct Output: Equatable, Hashable, Codable, Sendable {
223+
/// The voice the model uses to respond.
224+
///
225+
/// Voice cannot be changed during the session once the model has responded with audio at least once.
158226
public var voice: Voice
227+
228+
/// The speed of the model's spoken response.
229+
///
230+
/// `1.0` is the default speed. `0.25` is the minimum speed. `1.5` is the maximum speed.
231+
///
232+
/// This value can only be changed in between model turns, not while a response is in progress.
159233
public var speed: Double
234+
235+
/// The format of output audio.
160236
public var format: AudioFormat
161237
}
162238

239+
/// Configuration for input audio.
163240
public var input: Input
241+
242+
/// Configuration for output audio.
164243
public var output: Output
244+
245+
/// Creates a new `Audio` configuration.
246+
///
247+
/// - Parameter input: Configuration for input audio.
248+
/// - Parameter output: Configuration for output audio.
249+
public init(input: Input, output: Output) {
250+
self.input = input
251+
self.output = output
252+
}
165253
}
166254

167255
public struct Tool: Codable, Equatable, Hashable, Sendable {
@@ -333,6 +421,7 @@ import HelperCoders
333421
/// Unique identifier for the session
334422
public var id: String?
335423

424+
/// Configuration for input and output audio.
336425
public var audio: Audio
337426

338427
/// The default system instructions (i.e. system message) prepended to model calls.
@@ -366,7 +455,7 @@ import HelperCoders
366455
/// How the model chooses tools.
367456
public var toolChoice: ToolChoice?
368457

369-
/// Tools (functions) available to the model.
458+
/// Tools available to the model.
370459
public var tools: [Tool]?
371460

372461
public init(expiresAt: Date, id: String? = nil, audio: Audio, instructions: String, maxResponseOutputTokens: MaxResponseOutputTokens? = nil, modalities: [Modality]? = nil, model: Model, prompt: Prompt? = nil, temperature: Double? = nil, toolChoice: ToolChoice? = nil, tools: [Tool]? = nil) {

Sources/UI/Conversation.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import Core
22
import WebRTC
33
import Foundation
4-
@preconcurrency import AVFoundation
54

65
public enum ConversationError: Error {
76
case sessionNotFound

0 commit comments

Comments (0)