Skip to content

Commit be4eb4a

Browse files
feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint
1 parent 2224817 commit be4eb4a

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+1301
-90
lines changed

.stats.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
configured_endpoints: 135
2-
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml
3-
openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
4-
config_hash: f0940d0906846178759ef7128e4cb98e
2+
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
3+
openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
4+
config_hash: 03b48e9b8c7231a902403210dbd7dfa0

lib/openai.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,11 +79,14 @@
7979
require_relative "openai/models/audio/transcription"
8080
require_relative "openai/models/audio/transcription_create_params"
8181
require_relative "openai/models/audio/transcription_create_response"
82+
require_relative "openai/models/audio/transcription_diarized"
83+
require_relative "openai/models/audio/transcription_diarized_segment"
8284
require_relative "openai/models/audio/transcription_include"
8385
require_relative "openai/models/audio/transcription_segment"
8486
require_relative "openai/models/audio/transcription_stream_event"
8587
require_relative "openai/models/audio/transcription_text_delta_event"
8688
require_relative "openai/models/audio/transcription_text_done_event"
89+
require_relative "openai/models/audio/transcription_text_segment_event"
8790
require_relative "openai/models/audio/transcription_verbose"
8891
require_relative "openai/models/audio/transcription_word"
8992
require_relative "openai/models/audio/translation"

lib/openai/models/audio/transcription_create_params.rb

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
1919

2020
# @!attribute model
2121
# ID of the model to use. The options are `gpt-4o-transcribe`,
22-
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
23-
# Whisper V2 model).
22+
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
23+
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
2424
#
2525
# @return [String, Symbol, OpenAI::Models::AudioModel]
2626
required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model }
@@ -30,6 +30,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
3030
# first normalizes loudness and then uses voice activity detection (VAD) to choose
3131
# boundaries. `server_vad` object can be provided to tweak VAD detection
3232
# parameters manually. If unset, the audio is transcribed as a single block.
33+
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
34+
# seconds.
3335
#
3436
# @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil]
3537
optional :chunking_strategy,
@@ -41,11 +43,30 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
4143
# return the log probabilities of the tokens in the response to understand the
4244
# model's confidence in the transcription. `logprobs` only works with
4345
# response_format set to `json` and only with the models `gpt-4o-transcribe` and
44-
# `gpt-4o-mini-transcribe`.
46+
# `gpt-4o-mini-transcribe`. This field is not supported when using
47+
# `gpt-4o-transcribe-diarize`.
4548
#
4649
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
4750
optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] }
4851

52+
# @!attribute known_speaker_names
53+
# Optional list of speaker names that correspond to the audio samples provided in
54+
# `known_speaker_references[]`. Each entry should be a short identifier (for
55+
# example `customer` or `agent`). Up to 4 speakers are supported.
56+
#
57+
# @return [Array<String>, nil]
58+
optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String]
59+
60+
# @!attribute known_speaker_references
61+
# Optional list of audio samples (as
62+
# [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
63+
# that contain known speaker references matching `known_speaker_names[]`. Each
64+
# sample must be between 2 and 10 seconds, and can use any of the same input audio
65+
# formats supported by `file`.
66+
#
67+
# @return [Array<String>, nil]
68+
optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String]
69+
4970
# @!attribute language
5071
# The language of the input audio. Supplying the input language in
5172
# [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -58,15 +79,18 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
5879
# An optional text to guide the model's style or continue a previous audio
5980
# segment. The
6081
# [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
61-
# should match the audio language.
82+
# should match the audio language. This field is not supported when using
83+
# `gpt-4o-transcribe-diarize`.
6284
#
6385
# @return [String, nil]
6486
optional :prompt, String
6587

6688
# @!attribute response_format
6789
# The format of the output, in one of these options: `json`, `text`, `srt`,
68-
# `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
69-
# the only supported format is `json`.
90+
# `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
91+
# `gpt-4o-mini-transcribe`, the only supported format is `json`. For
92+
# `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
93+
# `diarized_json`, with `diarized_json` required to receive speaker annotations.
7094
#
7195
# @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
7296
optional :response_format, enum: -> { OpenAI::AudioResponseFormat }
@@ -86,13 +110,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
86110
# `response_format` must be set `verbose_json` to use timestamp granularities.
87111
# Either or both of these options are supported: `word`, or `segment`. Note: There
88112
# is no additional latency for segment timestamps, but generating word timestamps
89-
# incurs additional latency.
113+
# incurs additional latency. This option is not available for
114+
# `gpt-4o-transcribe-diarize`.
90115
#
91116
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionCreateParams::TimestampGranularity>, nil]
92117
optional :timestamp_granularities,
93118
-> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] }
94119

95-
# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
120+
# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
96121
# Some parameter documentations has been truncated, see
97122
# {OpenAI::Models::Audio::TranscriptionCreateParams} for more details.
98123
#
@@ -104,6 +129,10 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
104129
#
105130
# @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>] Additional information to include in the transcription response.
106131
#
132+
# @param known_speaker_names [Array<String>] Optional list of speaker names that correspond to the audio samples provided in
133+
#
134+
# @param known_speaker_references [Array<String>] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en-
135+
#
107136
# @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt
108137
#
109138
# @param prompt [String] An optional text to guide the model's style or continue a previous audio segment
@@ -117,14 +146,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
117146
# @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]
118147

119148
# ID of the model to use. The options are `gpt-4o-transcribe`,
120-
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
121-
# Whisper V2 model).
149+
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
150+
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
122151
module Model
123152
extend OpenAI::Internal::Type::Union
124153

125154
variant String
126155

127-
# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
156+
# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`.
128157
variant enum: -> { OpenAI::AudioModel }
129158

130159
# @!method self.variants
@@ -135,6 +164,8 @@ module Model
135164
# first normalizes loudness and then uses voice activity detection (VAD) to choose
136165
# boundaries. `server_vad` object can be provided to tweak VAD detection
137166
# parameters manually. If unset, the audio is transcribed as a single block.
167+
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
168+
# seconds.
138169
module ChunkingStrategy
139170
extend OpenAI::Internal::Type::Union
140171

lib/openai/models/audio/transcription_create_response.rb

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,14 @@ module TranscriptionCreateResponse
1515
# Represents a transcription response returned by model, based on the provided input.
1616
variant -> { OpenAI::Audio::Transcription }
1717

18+
# Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations.
19+
variant -> { OpenAI::Audio::TranscriptionDiarized }
20+
1821
# Represents a verbose json transcription response returned by model, based on the provided input.
1922
variant -> { OpenAI::Audio::TranscriptionVerbose }
2023

2124
# @!method self.variants
22-
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)]
25+
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)]
2326
end
2427
end
2528
end
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
# frozen_string_literal: true

module OpenAI
  module Models
    module Audio
      # Diarized transcription response model: the combined transcript plus
      # per-speaker segment annotations returned by `gpt-4o-transcribe-diarize`.
      class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel
        # @!attribute duration
        #   Duration of the input audio in seconds.
        #
        #   @return [Float]
        required :duration, Float

        # @!attribute segments
        #   Segments of the transcript annotated with timestamps and speaker labels.
        #
        #   @return [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>]
        required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] }

        # @!attribute task
        #   The type of task that was run. Always `transcribe`.
        #
        #   @return [Symbol, :transcribe]
        required :task, const: :transcribe

        # @!attribute text
        #   The concatenated transcript text for the entire audio input.
        #
        #   @return [String]
        required :text, String

        # @!attribute usage
        #   Token or duration usage statistics for the request.
        #
        #   @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil]
        optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage }

        # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe)
        #   Represents a diarized transcription response returned by the model, including
        #   the combined transcript and speaker-segment annotations.
        #
        #   @param duration [Float] Duration of the input audio in seconds.
        #
        #   @param segments [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>] Segments of the transcript annotated with timestamps and speaker labels.
        #
        #   @param text [String] The concatenated transcript text for the entire audio input.
        #
        #   @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request.
        #
        #   @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`.

        # Token or duration usage statistics for the request.
        #
        # @see OpenAI::Models::Audio::TranscriptionDiarized#usage
        module Usage
          extend OpenAI::Internal::Type::Union

          # Variants are dispatched on the `type` field of the payload.
          discriminator :type

          # Usage statistics for models billed by token usage.
          variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens }

          # Usage statistics for models billed by audio input duration.
          variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration }

          class Tokens < OpenAI::Internal::Type::BaseModel
            # @!attribute input_tokens
            #   Number of input tokens billed for this request.
            #
            #   @return [Integer]
            required :input_tokens, Integer

            # @!attribute output_tokens
            #   Number of output tokens generated.
            #
            #   @return [Integer]
            required :output_tokens, Integer

            # @!attribute total_tokens
            #   Total number of tokens used (input + output).
            #
            #   @return [Integer]
            required :total_tokens, Integer

            # @!attribute type
            #   The type of the usage object. Always `tokens` for this variant.
            #
            #   @return [Symbol, :tokens]
            required :type, const: :tokens

            # @!attribute input_token_details
            #   Details about the input tokens billed for this request.
            #
            #   @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil]
            optional :input_token_details,
                     -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails }

            # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens)
            #   Usage statistics for models billed by token usage.
            #
            #   @param input_tokens [Integer] Number of input tokens billed for this request.
            #
            #   @param output_tokens [Integer] Number of output tokens generated.
            #
            #   @param total_tokens [Integer] Total number of tokens used (input + output).
            #
            #   @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request.
            #
            #   @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant.

            # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details
            class InputTokenDetails < OpenAI::Internal::Type::BaseModel
              # @!attribute audio_tokens
              #   Number of audio tokens billed for this request.
              #
              #   @return [Integer, nil]
              optional :audio_tokens, Integer

              # @!attribute text_tokens
              #   Number of text tokens billed for this request.
              #
              #   @return [Integer, nil]
              optional :text_tokens, Integer

              # @!method initialize(audio_tokens: nil, text_tokens: nil)
              #   Details about the input tokens billed for this request.
              #
              #   @param audio_tokens [Integer] Number of audio tokens billed for this request.
              #
              #   @param text_tokens [Integer] Number of text tokens billed for this request.
            end
          end

          class Duration < OpenAI::Internal::Type::BaseModel
            # @!attribute seconds
            #   Duration of the input audio in seconds.
            #
            #   @return [Float]
            required :seconds, Float

            # @!attribute type
            #   The type of the usage object. Always `duration` for this variant.
            #
            #   @return [Symbol, :duration]
            required :type, const: :duration

            # @!method initialize(seconds:, type: :duration)
            #   Usage statistics for models billed by audio input duration.
            #
            #   @param seconds [Float] Duration of the input audio in seconds.
            #
            #   @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant.
          end

          # @!method self.variants
          #   @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)]
        end
      end
    end
  end
end
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# frozen_string_literal: true

module OpenAI
  module Models
    module Audio
      # A single segment of diarized transcript text with speaker metadata,
      # as emitted inside a diarized transcription response.
      class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel
        # @!attribute id
        #   Unique identifier for the segment.
        #
        #   @return [String]
        required :id, String

        # @!attribute end_
        #   End timestamp of the segment in seconds.
        #
        #   @return [Float]
        required :end_, Float, api_name: :end

        # @!attribute speaker
        #   Speaker label for this segment. When known speakers are provided, the label
        #   matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially
        #   using capital letters (`A`, `B`, ...).
        #
        #   @return [String]
        required :speaker, String

        # @!attribute start
        #   Start timestamp of the segment in seconds.
        #
        #   @return [Float]
        required :start, Float

        # @!attribute text
        #   Transcript text for this segment.
        #
        #   @return [String]
        required :text, String

        # @!attribute type
        #   The type of the segment. Always `transcript.text.segment`.
        #
        #   @return [Symbol, :"transcript.text.segment"]
        required :type, const: :"transcript.text.segment"

        # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment")
        #   Some parameter documentations has been truncated, see
        #   {OpenAI::Models::Audio::TranscriptionDiarizedSegment} for more details.
        #
        #   A segment of diarized transcript text with speaker metadata.
        #
        #   @param id [String] Unique identifier for the segment.
        #
        #   @param end_ [Float] End timestamp of the segment in seconds.
        #
        #   @param speaker [String] Speaker label for this segment. When known speakers are provided, the label matc
        #
        #   @param start [Float] Start timestamp of the segment in seconds.
        #
        #   @param text [String] Transcript text for this segment.
        #
        #   @param type [Symbol, :"transcript.text.segment"] The type of the segment. Always `transcript.text.segment`.
      end
    end
  end
end

0 commit comments

Comments
 (0)