
Commit f8a3e9c

feat(api): api update
1 parent 04bf089 commit f8a3e9c


6 files changed (+110, -12 lines)


.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 41
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-d77958642fd9231b5b67d43b97bf6ee9716eff4fad6d9d0ab0bedfd2a10377e5.yml
-openapi_spec_hash: ddb6e3f4533d6bc75570ca13656c557f
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-f53d9282224f2c3943d83d014d64ba61271f3aedef59197cc4dae0102d2b365d.yml
+openapi_spec_hash: a884fe7d04e9f64675e3943a962ebb65
 config_hash: 73457be4d72f0bf4c22de49f2b2d4ec3

src/resources/audio/audio.ts

Lines changed: 20 additions & 8 deletions
@@ -22,8 +22,8 @@ export class Audio extends APIResource {
    * ```ts
    * const audio = await client.audio.create({
    *   input: 'input',
-   *   model: 'cartesia/sonic',
-   *   voice: 'laidback woman',
+   *   model: 'canopylabs/orpheus-3b-0.1-ft',
+   *   voice: 'voice',
    * });
    *
    * const content = await audio.blob();
@@ -88,17 +88,26 @@ export interface AudioCreateParamsBase {
    * The name of the model to query.
    *
    * [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models)
+   * The current supported tts models are: - cartesia/sonic - hexgrad/Kokoro-82M -
+   * canopylabs/orpheus-3b-0.1-ft
    */
-  model: 'cartesia/sonic' | (string & {});
+  model: 'cartesia/sonic' | 'hexgrad/Kokoro-82M' | 'canopylabs/orpheus-3b-0.1-ft' | (string & {});

   /**
-   * The voice to use for generating the audio.
+   * The voice to use for generating the audio. The voices supported are different
+   * for each model. For eg - for canopylabs/orpheus-3b-0.1-ft, one of the voices
+   * supported is tara, for hexgrad/Kokoro-82M, one of the voices supported is
+   * af_alloy and for cartesia/sonic, one of the voices supported is "friendly
+   * sidekick".
+   *
+   * You can view the voices supported for each model using the /v1/voices endpoint
+   * sending the model name as the query parameter.
    * [View all supported voices here](https://docs.together.ai/docs/text-to-speech#voices-available).
    */
-  voice: 'laidback woman' | 'polite man' | 'storyteller lady' | 'friendly sidekick' | (string & {});
+  voice: string;

   /**
-   * Language of input text
+   * Language of input text.
    */
   language?:
     | 'en'
@@ -123,12 +132,15 @@ export interface AudioCreateParamsBase {
   response_encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_mulaw' | 'pcm_alaw';

   /**
-   * The format of audio output
+   * The format of audio output. Supported formats are mp3, wav, raw if streaming is
+   * false. If streaming is true, the only supported format is raw.
    */
   response_format?: 'mp3' | 'wav' | 'raw';

   /**
-   * Sampling rate to use for the output audio
+   * Sampling rate to use for the output audio. The default sampling rate for
+   * canopylabs/orpheus-3b-0.1-ft and hexgrad/Kokoro-82M is 24000 and for
+   * cartesia/sonic is 44100.
    */
   sample_rate?: number;
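
Taken together, the audio.ts changes widen the `model` union, loosen `voice` to a free-form string, and document per-model defaults. Below is a minimal usage sketch under those docs; the voice name, output path, and the raw `/v1/voices` request are illustrative assumptions, not part of this diff (the diff adds no SDK helper for that endpoint).

```ts
import fs from 'node:fs';
import Together from 'together-ai';

const client = new Together(); // assumes TOGETHER_API_KEY is set in the environment

const model = 'hexgrad/Kokoro-82M';

// The updated JSDoc points to /v1/voices with the model as a query parameter;
// a raw request is sketched here since no dedicated method appears in this commit.
const voicesRes = await fetch(
  `https://api.together.xyz/v1/voices?model=${encodeURIComponent(model)}`,
  { headers: { Authorization: `Bearer ${process.env.TOGETHER_API_KEY}` } },
);
console.log(await voicesRes.json());

// `voice` is now a plain string; 'af_alloy' is one Kokoro voice named in the docs.
const audio = await client.audio.create({
  input: 'Hello from the updated TTS surface.',
  model,
  voice: 'af_alloy',
  response_format: 'mp3', // mp3 | wav | raw when not streaming; raw only when streaming
  sample_rate: 24000, // documented default for hexgrad/Kokoro-82M and canopylabs/orpheus-3b-0.1-ft
});

// Mirrors the JSDoc example: the response exposes blob().
const content = await audio.blob();
await fs.promises.writeFile('speech.mp3', Buffer.from(await content.arrayBuffer()));
```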

src/resources/audio/transcriptions.ts

Lines changed: 80 additions & 0 deletions
@@ -64,6 +64,11 @@ export namespace TranscriptionCreateResponse {
      */
     text: string;

+    /**
+     * Array of transcription speaker segments (only when diarize is enabled)
+     */
+    speaker_segments?: Array<AudioTranscriptionVerboseJsonResponse.SpeakerSegment>;
+
     /**
      * Array of transcription words (only when timestamp_granularities includes 'word')
      */
@@ -93,6 +98,62 @@ export namespace TranscriptionCreateResponse {
       text: string;
     }

+    export interface SpeakerSegment {
+      /**
+       * Unique identifier for the speaker segment
+       */
+      id: number;
+
+      /**
+       * End time of the speaker segment in seconds
+       */
+      end: number;
+
+      /**
+       * The speaker identifier
+       */
+      speaker_id: string;
+
+      /**
+       * Start time of the speaker segment in seconds
+       */
+      start: number;
+
+      /**
+       * The full text spoken by this speaker in this segment
+       */
+      text: string;
+
+      /**
+       * Array of words spoken by this speaker in this segment
+       */
+      words: Array<SpeakerSegment.Word>;
+    }
+
+    export namespace SpeakerSegment {
+      export interface Word {
+        /**
+         * End time of the word in seconds
+         */
+        end: number;
+
+        /**
+         * Start time of the word in seconds
+         */
+        start: number;
+
+        /**
+         * The word
+         */
+        word: string;
+
+        /**
+         * The speaker id for the word (only when diarize is enabled)
+         */
+        speaker_id?: string;
+      }
+    }
+
     export interface Word {
       /**
        * End time of the word in seconds
@@ -108,6 +169,11 @@ export namespace TranscriptionCreateResponse {
        * The word
        */
       word: string;
+
+      /**
+       * The speaker id for the word (only when diarize is enabled)
+       */
+      speaker_id?: string;
     }
   }
 }
@@ -118,6 +184,20 @@ export interface TranscriptionCreateParams {
    */
   file: Uploadable;

+  /**
+   * Whether to enable speaker diarization. When enabled, you will get the speaker id
+   * for each word in the transcription. In the response, in the words array, you
+   * will get the speaker id for each word. In addition, we also return the
+   * speaker_segments array which contains the speaker id for each speaker segment
+   * along with the start and end time of the segment along with all the words in the
+   * segment.
+   *
+   * For eg - ... "speaker_segments": [ "speaker_id": "SPEAKER_00", "start": 0,
+   * "end": 30.02, "words": [ { "id": 0, "word": "Tijana", "start": 0, "end": 11.475,
+   * "speaker_id": "SPEAKER_00" }, ...
+   */
+  diarize?: boolean;
+
   /**
    * Optional ISO 639-1 language code. If `auto` is provided, language is
    * auto-detected.
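
The new `diarize` flag and `SpeakerSegment` types suggest a flow like the one below. A hedged sketch, assuming an ESM script, a local meeting.wav, and that the diarized fields arrive on the verbose JSON variant of the response union (hence the structural cast):

```ts
import fs from 'node:fs';
import Together, { toFile } from 'together-ai';

const client = new Together();

const transcription = await client.audio.transcriptions.create({
  file: await toFile(fs.createReadStream('meeting.wav'), 'meeting.wav'),
  model: 'openai/whisper-large-v3',
  diarize: true, // new flag: per-word speaker ids plus a speaker_segments array
});

// speaker_segments is only present when diarize is enabled, and it lives on the
// verbose JSON response shape, so narrow the type by hand for this sketch.
const { speaker_segments } = transcription as {
  speaker_segments?: Array<{
    speaker_id: string;
    start: number;
    end: number;
    text: string;
    words: Array<{ word: string; start: number; end: number; speaker_id?: string }>;
  }>;
};

for (const segment of speaker_segments ?? []) {
  console.log(`${segment.speaker_id} [${segment.start}s to ${segment.end}s]: ${segment.text}`);
}
```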

src/resources/audio/translations.ts

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,11 @@ export namespace TranslationCreateResponse {
       * The word
       */
      word: string;
+
+     /**
+      * The speaker id for the word (only when diarize is enabled)
+      */
+     speaker_id?: string;
    }
  }
}
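
Translations gain the same optional per-word `speaker_id` as transcriptions. A small hypothetical helper, typed structurally so it compiles against either response's `words` array:

```ts
// Hypothetical helper: collect the distinct speaker ids present in word-level
// output; speaker_id is only set when diarize is enabled.
function distinctSpeakers(words: Array<{ word: string; speaker_id?: string }>): Set<string> {
  const speakers = new Set<string>();
  for (const w of words) {
    if (w.speaker_id) speakers.add(w.speaker_id);
  }
  return speakers;
}
```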

tests/api-resources/audio/audio.test.ts

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ describe('resource audio', () => {
   test('create: required and optional params', async () => {
     const response = await client.audio.create({
       input: 'input',
-      model: 'cartesia/sonic',
-      voice: 'laidback woman',
+      model: 'canopylabs/orpheus-3b-0.1-ft',
+      voice: 'voice',
       language: 'en',
       response_encoding: 'pcm_f32le',
       response_format: 'mp3',

tests/api-resources/audio/transcriptions.test.ts

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ describe('resource transcriptions', () => {
   test('create: required and optional params', async () => {
     const response = await client.audio.transcriptions.create({
       file: await toFile(Buffer.from('# my file contents'), 'README.md'),
+      diarize: true,
       language: 'en',
       model: 'openai/whisper-large-v3',
       prompt: 'prompt',
