
Commit f8a3e9c

feat(api): api update
1 parent 04bf089 commit f8a3e9c


6 files changed (+110, -12 lines)


.stats.yml

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 configured_endpoints: 41
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-d77958642fd9231b5b67d43b97bf6ee9716eff4fad6d9d0ab0bedfd2a10377e5.yml
-openapi_spec_hash: ddb6e3f4533d6bc75570ca13656c557f
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/togetherai%2Ftogetherai-f53d9282224f2c3943d83d014d64ba61271f3aedef59197cc4dae0102d2b365d.yml
+openapi_spec_hash: a884fe7d04e9f64675e3943a962ebb65
 config_hash: 73457be4d72f0bf4c22de49f2b2d4ec3

src/resources/audio/audio.ts

Lines changed: 20 additions & 8 deletions
@@ -22,8 +22,8 @@ export class Audio extends APIResource {
    * ```ts
    * const audio = await client.audio.create({
    *   input: 'input',
-   *   model: 'cartesia/sonic',
-   *   voice: 'laidback woman',
+   *   model: 'canopylabs/orpheus-3b-0.1-ft',
+   *   voice: 'voice',
    * });
    *
    * const content = await audio.blob();
@@ -88,17 +88,26 @@ export interface AudioCreateParamsBase {
    * The name of the model to query.
    *
    * [See all of Together AI's chat models](https://docs.together.ai/docs/serverless-models#audio-models)
+   * The current supported tts models are: - cartesia/sonic - hexgrad/Kokoro-82M -
+   * canopylabs/orpheus-3b-0.1-ft
    */
-  model: 'cartesia/sonic' | (string & {});
+  model: 'cartesia/sonic' | 'hexgrad/Kokoro-82M' | 'canopylabs/orpheus-3b-0.1-ft' | (string & {});

   /**
-   * The voice to use for generating the audio.
+   * The voice to use for generating the audio. The voices supported are different
+   * for each model. For eg - for canopylabs/orpheus-3b-0.1-ft, one of the voices
+   * supported is tara, for hexgrad/Kokoro-82M, one of the voices supported is
+   * af_alloy and for cartesia/sonic, one of the voices supported is "friendly
+   * sidekick".
+   *
+   * You can view the voices supported for each model using the /v1/voices endpoint
+   * sending the model name as the query parameter.
    * [View all supported voices here](https://docs.together.ai/docs/text-to-speech#voices-available).
    */
-  voice: 'laidback woman' | 'polite man' | 'storyteller lady' | 'friendly sidekick' | (string & {});
+  voice: string;

   /**
-   * Language of input text
+   * Language of input text.
    */
   language?:
     | 'en'
@@ -123,12 +132,15 @@ export interface AudioCreateParamsBase {
   response_encoding?: 'pcm_f32le' | 'pcm_s16le' | 'pcm_mulaw' | 'pcm_alaw';

   /**
-   * The format of audio output
+   * The format of audio output. Supported formats are mp3, wav, raw if streaming is
+   * false. If streaming is true, the only supported format is raw.
    */
   response_format?: 'mp3' | 'wav' | 'raw';

   /**
-   * Sampling rate to use for the output audio
+   * Sampling rate to use for the output audio. The default sampling rate for
+   * canopylabs/orpheus-3b-0.1-ft and hexgrad/Kokoro-82M is 24000 and for
+   * cartesia/sonic is 44100.
    */
   sample_rate?: number;
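
Taken together, the audio.ts changes widen the `model` union, loosen `voice` to a free-form string, and document per-model defaults. Below is a minimal usage sketch under those docs; the voice name, output path, and the raw `/v1/voices` request are illustrative assumptions, not part of this diff (the diff adds no SDK helper for that endpoint).

```ts
import fs from 'node:fs';
import Together from 'together-ai';

const client = new Together(); // assumes TOGETHER_API_KEY is set in the environment

const model = 'hexgrad/Kokoro-82M';

// The updated JSDoc points to /v1/voices with the model as a query parameter;
// a raw request is sketched here since no dedicated method appears in this commit.
const voicesRes = await fetch(
  `https://api.together.xyz/v1/voices?model=${encodeURIComponent(model)}`,
  { headers: { Authorization: `Bearer ${process.env.TOGETHER_API_KEY}` } },
);
console.log(await voicesRes.json());

// `voice` is now a plain string; 'af_alloy' is one Kokoro voice named in the docs.
const audio = await client.audio.create({
  input: 'Hello from the updated TTS surface.',
  model,
  voice: 'af_alloy',
  response_format: 'mp3', // mp3 | wav | raw when not streaming; raw only when streaming
  sample_rate: 24000, // documented default for hexgrad/Kokoro-82M and canopylabs/orpheus-3b-0.1-ft
});

// Mirrors the JSDoc example: the response exposes blob().
const content = await audio.blob();
await fs.promises.writeFile('speech.mp3', Buffer.from(await content.arrayBuffer()));
```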

src/resources/audio/transcriptions.ts

Lines changed: 80 additions & 0 deletions
@@ -64,6 +64,11 @@ export namespace TranscriptionCreateResponse {
      */
     text: string;

+    /**
+     * Array of transcription speaker segments (only when diarize is enabled)
+     */
+    speaker_segments?: Array<AudioTranscriptionVerboseJsonResponse.SpeakerSegment>;
+
     /**
      * Array of transcription words (only when timestamp_granularities includes 'word')
      */
@@ -93,6 +98,62 @@ export namespace TranscriptionCreateResponse {
       text: string;
     }

+    export interface SpeakerSegment {
+      /**
+       * Unique identifier for the speaker segment
+       */
+      id: number;
+
+      /**
+       * End time of the speaker segment in seconds
+       */
+      end: number;
+
+      /**
+       * The speaker identifier
+       */
+      speaker_id: string;
+
+      /**
+       * Start time of the speaker segment in seconds
+       */
+      start: number;
+
+      /**
+       * The full text spoken by this speaker in this segment
+       */
+      text: string;
+
+      /**
+       * Array of words spoken by this speaker in this segment
+       */
+      words: Array<SpeakerSegment.Word>;
+    }
+
+    export namespace SpeakerSegment {
+      export interface Word {
+        /**
+         * End time of the word in seconds
+         */
+        end: number;
+
+        /**
+         * Start time of the word in seconds
+         */
+        start: number;
+
+        /**
+         * The word
+         */
+        word: string;
+
+        /**
+         * The speaker id for the word (only when diarize is enabled)
+         */
+        speaker_id?: string;
+      }
+    }
+
     export interface Word {
       /**
        * End time of the word in seconds
@@ -108,6 +169,11 @@ export namespace TranscriptionCreateResponse {
        * The word
        */
       word: string;
+
+      /**
+       * The speaker id for the word (only when diarize is enabled)
+       */
+      speaker_id?: string;
     }
   }
 }
@@ -118,6 +184,20 @@ export interface TranscriptionCreateParams {
    */
   file: Uploadable;

+  /**
+   * Whether to enable speaker diarization. When enabled, you will get the speaker id
+   * for each word in the transcription. In the response, in the words array, you
+   * will get the speaker id for each word. In addition, we also return the
+   * speaker_segments array which contains the speaker id for each speaker segment
+   * along with the start and end time of the segment along with all the words in the
+   * segment.
+   *
+   * For eg - ... "speaker_segments": [ "speaker_id": "SPEAKER_00", "start": 0,
+   * "end": 30.02, "words": [ { "id": 0, "word": "Tijana", "start": 0, "end": 11.475,
+   * "speaker_id": "SPEAKER_00" }, ...
+   */
+  diarize?: boolean;
+
   /**
    * Optional ISO 639-1 language code. If `auto` is provided, language is
    * auto-detected.
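
The new `diarize` flag and `SpeakerSegment` types suggest a flow like the one below. A hedged sketch, assuming an ESM script, a local meeting.wav, and that the diarized fields arrive on the verbose JSON variant of the response union (hence the structural cast):

```ts
import fs from 'node:fs';
import Together, { toFile } from 'together-ai';

const client = new Together();

const transcription = await client.audio.transcriptions.create({
  file: await toFile(fs.createReadStream('meeting.wav'), 'meeting.wav'),
  model: 'openai/whisper-large-v3',
  diarize: true, // new flag: per-word speaker ids plus a speaker_segments array
});

// speaker_segments is only present when diarize is enabled, and it lives on the
// verbose JSON response shape, so narrow the type by hand for this sketch.
const { speaker_segments } = transcription as {
  speaker_segments?: Array<{
    speaker_id: string;
    start: number;
    end: number;
    text: string;
    words: Array<{ word: string; start: number; end: number; speaker_id?: string }>;
  }>;
};

for (const segment of speaker_segments ?? []) {
  console.log(`${segment.speaker_id} [${segment.start}s to ${segment.end}s]: ${segment.text}`);
}
```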

src/resources/audio/translations.ts

Lines changed: 5 additions & 0 deletions
@@ -107,6 +107,11 @@ export namespace TranslationCreateResponse {
       * The word
       */
      word: string;
+
+     /**
+      * The speaker id for the word (only when diarize is enabled)
+      */
+     speaker_id?: string;
    }
  }
}
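
Translations gain the same optional per-word `speaker_id` as transcriptions. A small hypothetical helper, typed structurally so it compiles against either response's `words` array:

```ts
// Hypothetical helper: collect the distinct speaker ids present in word-level
// output; speaker_id is only set when diarize is enabled.
function distinctSpeakers(words: Array<{ word: string; speaker_id?: string }>): Set<string> {
  const speakers = new Set<string>();
  for (const w of words) {
    if (w.speaker_id) speakers.add(w.speaker_id);
  }
  return speakers;
}
```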

tests/api-resources/audio/audio.test.ts

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ describe('resource audio', () => {
   test('create: required and optional params', async () => {
     const response = await client.audio.create({
       input: 'input',
-      model: 'cartesia/sonic',
-      voice: 'laidback woman',
+      model: 'canopylabs/orpheus-3b-0.1-ft',
+      voice: 'voice',
       language: 'en',
       response_encoding: 'pcm_f32le',
       response_format: 'mp3',

tests/api-resources/audio/transcriptions.test.ts

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ describe('resource transcriptions', () => {
   test('create: required and optional params', async () => {
     const response = await client.audio.transcriptions.create({
       file: await toFile(Buffer.from('# my file contents'), 'README.md'),
+      diarize: true,
       language: 'en',
       model: 'openai/whisper-large-v3',
       prompt: 'prompt',
