Add changes for audio speech and audio transcriptions (#388)

rajasbansal · zainhas · web-flow · commit 9806cf1e7360 · 2025-10-30T20:25:54.000-07:00
* Add changes for audio speech and audio transcriptions

* Remove testing word stuff

* Black formatting

---------

Co-authored-by: Zain Hasan <zain_has@hotmail.com>
diff --git a/src/together/resources/audio/speech.py b/src/together/resources/audio/speech.py
@@ -30,7 +30,7 @@ def create(
         response_format: str = "wav",
         language: str = "en",
         response_encoding: str = "pcm_f32le",
-        sample_rate: int = 44100,
+        sample_rate: int | None = None,
         stream: bool = False,
         **kwargs: Any,
     ) -> AudioSpeechStreamResponse:
@@ -49,14 +49,20 @@ def create(
             response_encoding (str, optional): Audio encoding of response.
                 Defaults to "pcm_f32le".
             sample_rate (int, optional): Sampling rate to use for the output audio.
-                Defaults to 44100.
+                Defaults to None. If not provided, the default sampling rate for the model will be used.
             stream (bool, optional): If true, output is streamed for several characters at a time.
                 Defaults to False.
 
         Returns:
             Union[bytes, Iterator[AudioSpeechStreamChunk]]: The generated audio as bytes or an iterator over audio stream chunks.
         """
 
+        if sample_rate is None:
+            if "cartesia" in model:
+                sample_rate = 44100
+            else:
+                sample_rate = 24000
+
         requestor = api_requestor.APIRequestor(
             client=self._client,
         )
diff --git a/src/together/resources/audio/transcriptions.py b/src/together/resources/audio/transcriptions.py
@@ -30,6 +30,7 @@ def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -52,7 +53,11 @@ def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription.
+                In the response, in the words array, you will get the speaker id for each word.
+                In addition, we also return the speaker_segments array which contains the speaker id for each speaker segment along with the start and end time of the segment along with all the words in the segment.
+                You can use the speaker_id to group the words by speaker.
+                You can use the speaker_segments to get the start and end time of each speaker segment.
         Returns:
             The transcribed text in the requested format.
         """
@@ -103,6 +108,9 @@ def create(
                 else timestamp_granularities
             )
 
+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -135,6 +143,7 @@ def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)
@@ -158,6 +167,7 @@ async def create(
         timestamp_granularities: Optional[
             Union[str, AudioTimestampGranularities]
         ] = None,
+        diarize: bool = False,
         **kwargs: Any,
     ) -> Union[AudioTranscriptionResponse, AudioTranscriptionVerboseResponse]:
         """
@@ -180,7 +190,11 @@ async def create(
             timestamp_granularities: The timestamp granularities to populate for this
                 transcription. response_format must be set verbose_json to use timestamp
                 granularities. Either or both of these options are supported: word, or segment.
-
+            diarize: Whether to enable speaker diarization. When enabled, you will get the speaker id for each word in the transcription.
+                In the response, in the words array, you will get the speaker id for each word.
+                In addition, we also return the speaker_segments array which contains the speaker id for each speaker segment along with the start and end time of the segment along with all the words in the segment.
+                You can use the speaker_id to group the words by speaker.
+                You can use the speaker_segments to get the start and end time of each speaker segment.
         Returns:
             The transcribed text in the requested format.
         """
@@ -239,6 +253,9 @@ async def create(
                 )
             )
 
+        if diarize:
+            params_data["diarize"] = diarize
+
         # Add any additional kwargs
         # Convert boolean values to lowercase strings for proper form encoding
         for key, value in kwargs.items():
@@ -271,6 +288,7 @@ async def create(
         if (
             response_format == "verbose_json"
             or response_format == AudioTranscriptionResponseFormat.VERBOSE_JSON
+            or diarize
         ):
             # Create response with model validation that preserves extra fields
             return AudioTranscriptionVerboseResponse.model_validate(response.data)
diff --git a/tests/integration/resources/test_transcriptions.py b/tests/integration/resources/test_transcriptions.py
@@ -36,19 +36,6 @@ def validate_diarization_response(response_dict):
             assert "end" in word
             assert "speaker_id" in word
 
-    # Validate top-level words field
-    assert "words" in response_dict
-    assert isinstance(response_dict["words"], list)
-    assert len(response_dict["words"]) > 0
-
-    # Validate each word in top-level words
-    for word in response_dict["words"]:
-        assert "id" in word
-        assert "word" in word
-        assert "start" in word
-        assert "end" in word
-        assert "speaker_id" in word
-
 
 class TestTogetherTranscriptions:
     @pytest.fixture