Simplified speech samples (set-2) (GoogleCloudPlatform#12120)

* Simplified next set of samples
* Fixed nits and added test
* 🦉 Updates from OwlBot post-processor
  See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md
---------
Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
killkelleyr · Aug 14, 2024 · de4ad98 · de4ad98
1 parent 6d59d2d
commit de4ad98
Show file tree

Hide file tree

Showing 9 changed files with 102 additions and 82 deletions.
diff --git a/speech/snippets/transcribe_async_gcs_test.py b/speech/snippets/transcribe_async_gcs_test.py
@@ -18,13 +18,11 @@
 
 import transcribe_async_gcs
 import transcribe_diarization_gcs_beta
-import transcribe_multilanguage_gcs_beta
 import transcribe_word_level_confidence_gcs_beta
 
 BUCKET = "cloud-samples-data"
 GCS_AUDIO_PATH = "gs://" + BUCKET + "/speech/brooklyn_bridge.flac"
 GCS_DIARIZATION_AUDIO_PATH = "gs://" + BUCKET + "/speech/commercial_mono.wav"
-GCS_MUTLILANGUAGE_PATH = "gs://" + BUCKET + "/speech/Google_Gnome.wav"
 
 
 @Retry()
@@ -40,15 +38,6 @@ def test_transcribe_diarization_gcs_beta() -> None:
     assert is_completed
 
 
-def test_transcribe_multilanguage_gcs_bets() -> None:
-    transcript = (
-        transcribe_multilanguage_gcs_beta.transcribe_file_with_multilanguage_gcs(
-            GCS_MUTLILANGUAGE_PATH
-        )
-    )
-    assert re.search("Transcript: OK Google", transcript)
-
-
 def test_transcribe_word_level_confidence_gcs_beta() -> None:
     transcript = transcribe_word_level_confidence_gcs_beta.transcribe_file_with_word_level_confidence(
         GCS_AUDIO_PATH

diff --git a/speech/snippets/transcribe_chirp.py b/speech/snippets/transcribe_chirp.py
@@ -12,20 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import argparse
-
 # [START speech_transcribe_chirp]
+import os
+
 from google.api_core.client_options import ClientOptions
 from google.cloud.speech_v2 import SpeechClient
 from google.cloud.speech_v2.types import cloud_speech
 
+PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
+
 
 def transcribe_chirp(
-    project_id: str,
     audio_file: str,
 ) -> cloud_speech.RecognizeResponse:
-    """Transcribe an audio file using Chirp."""
+    """Transcribes an audio file using the Chirp model of Google Cloud Speech-to-Text API.
+
+    Args:
+        audio_file (str): Path to the local audio file to be transcribed.
+            Example: "resources/audio.wav"
+    Returns:
+        cloud_speech.RecognizeResponse: The response from the Speech-to-Text API containing
+        the transcription results.
+    """
     # Instantiates a client
     client = SpeechClient(
         client_options=ClientOptions(
@@ -35,7 +43,7 @@ def transcribe_chirp(
 
     # Reads a file as bytes
     with open(audio_file, "rb") as f:
-        content = f.read()
+        audio_content = f.read()
 
     config = cloud_speech.RecognitionConfig(
         auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
@@ -44,9 +52,9 @@ def transcribe_chirp(
     )
 
     request = cloud_speech.RecognizeRequest(
-        recognizer=f"projects/{project_id}/locations/us-central1/recognizers/_",
+        recognizer=f"projects/{PROJECT_ID}/locations/us-central1/recognizers/_",
         config=config,
-        content=content,
+        content=audio_content,
     )
 
     # Transcribes the audio into text
@@ -62,10 +70,4 @@ def transcribe_chirp(
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument("project_id", help="GCP Project ID")
-    parser.add_argument("audio_file", help="Audio file to stream")
-    args = parser.parse_args()
-    transcribe_chirp(args.project_id, args.audio_file)
+    transcribe_chirp("resources/audio.wav")
diff --git a/speech/snippets/transcribe_chirp_test.py b/speech/snippets/transcribe_chirp_test.py
@@ -24,12 +24,7 @@
 
 @Retry()
 def test_transcribe_chirp() -> None:
-    project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
-
-    response = transcribe_chirp.transcribe_chirp(
-        project_id, os.path.join(_RESOURCES, "audio.wav")
-    )
-
+    response = transcribe_chirp.transcribe_chirp(os.path.join(_RESOURCES, "audio.wav"))
     assert re.search(
         r"how old is the Brooklyn Bridge",
         response.results[0].alternatives[0].transcript,

diff --git a/speech/snippets/transcribe_context_classes.py b/speech/snippets/transcribe_context_classes.py
@@ -16,20 +16,19 @@
 from google.cloud import speech
 
 
-def transcribe_context_classes(storage_uri: str) -> speech.RecognizeResponse:
-    """Provides "hints" to the speech recognizer to
-    favor specific classes of words in the results.
-
+def transcribe_context_classes(audio_uri: str) -> speech.RecognizeResponse:
+    """Provides "hints" to the speech recognizer to favor
+    specific classes of words in the results.
     Args:
-        storage_uri: The URI of the audio file to transcribe.
-
+        audio_uri: The URI of the audio file to transcribe.
+            E.g., gs://[BUCKET]/[FILE]
     Returns:
-        The transcript of the audio file.
+        speech.RecognizeResponse: The response containing the transcription results.
     """
     client = speech.SpeechClient()
 
-    # storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
-    audio = speech.RecognitionAudio(uri=storage_uri)
+    # audio_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
+    audio = speech.RecognitionAudio(uri=audio_uri)
 
     # SpeechContext: to configure your speech_context see:
     # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext

diff --git a/speech/snippets/transcribe_diarization_gcs_beta.py b/speech/snippets/transcribe_diarization_gcs_beta.py
@@ -17,25 +17,24 @@
 from google.cloud import speech
 
 
-def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:
+def transcribe_diarization_gcs_beta(audio_uri: str) -> bool:
     """Transcribe a remote audio file (stored in Google Cloud Storage) using speaker diarization.
-
     Args:
-        gcs_uri: The Google Cloud Storage path to an audio file.
-
+        audio_uri (str): The Google Cloud Storage path to an audio file.
+            E.g., gs://[BUCKET]/[FILE]
     Returns:
         True if the operation successfully completed, False otherwise.
     """
 
     client = speech.SpeechClient()
-
+    # Configure speaker diarization; this sample expects exactly two speakers
     speaker_diarization_config = speech.SpeakerDiarizationConfig(
         enable_speaker_diarization=True,
-        min_speaker_count=2,
-        max_speaker_count=2,
+        min_speaker_count=2,  # Set minimum number of speakers
+        max_speaker_count=2,  # Adjust max speakers based on expected number of speakers
     )
 
-    # Configure request to enable Speaker diarization
+    # Configure the recognition request (audio encoding and language)
     recognition_config = speech.RecognitionConfig(
         encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         language_code="en-US",
@@ -45,7 +44,7 @@ def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:
 
     # Set the remote path for the audio file
     audio = speech.RecognitionAudio(
-        uri=gcs_uri,
+        uri=audio_uri,
     )
 
     # Use non-blocking call for getting file transcription
@@ -67,3 +66,8 @@ def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:
 
 
 # [END speech_transcribe_diarization_gcs_beta]
+
+if __name__ == "__main__":
+    transcribe_diarization_gcs_beta(
+        audio_uri="gs://cloud-samples-data/speech/commercial_mono.wav"
+    )
diff --git a/speech/snippets/transcribe_enhanced_model.py b/speech/snippets/transcribe_enhanced_model.py
@@ -14,27 +14,28 @@
 
 """Google Cloud Speech API sample that demonstrates enhanced models
 and recognition metadata.
-
-Example usage:
-    python transcribe_enhanced_model.py resources/commercial_mono.wav
 """
 # [START speech_transcribe_enhanced_model]
 
-import argparse
-
 from google.cloud import speech
 
 
-def transcribe_file_with_enhanced_model(path: str) -> speech.RecognizeResponse:
-    """Transcribe the given audio file using an enhanced model."""
+def transcribe_file_with_enhanced_model(audio_file: str) -> speech.RecognizeResponse:
+    """Transcribe the given audio file using an enhanced model.
+    Args:
+        audio_file (str): Path to the local audio file to be transcribed.
+            Example: "resources/commercial_mono.wav"
+    Returns:
+        speech.RecognizeResponse: The response containing the transcription results.
+    """
 
     client = speech.SpeechClient()
 
-    # path = 'resources/commercial_mono.wav'
-    with open(path, "rb") as audio_file:
-        content = audio_file.read()
+    # audio_file = 'resources/commercial_mono.wav'
+    with open(audio_file, "rb") as f:
+        audio_content = f.read()
 
-    audio = speech.RecognitionAudio(content=content)
+    audio = speech.RecognitionAudio(content=audio_content)
     config = speech.RecognitionConfig(
         encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=8000,
@@ -57,11 +58,4 @@ def transcribe_file_with_enhanced_model(path: str) -> speech.RecognizeResponse:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    parser.add_argument("path", help="File to stream to the API")
-
-    args = parser.parse_args()
-
-    transcribe_file_with_enhanced_model(args.path)
+    transcribe_file_with_enhanced_model("resources/commercial_mono.wav")
diff --git a/speech/snippets/transcribe_enhanced_model_test.py b/speech/snippets/transcribe_enhanced_model_test.py
@@ -29,5 +29,7 @@ def test_transcribe_file_with_enhanced_model(capsys: pytest.CaptureFixture) -> N
     )
     out, _ = capsys.readouterr()
 
-    assert "Chrome" in out
     assert result is not None
+    assert "Chrome" in out
+    assert "First alternative" in out
+    assert "result 7" in out
diff --git a/speech/snippets/transcribe_multilanguage_gcs_beta.py b/speech/snippets/transcribe_multilanguage_gcs_beta.py
@@ -17,31 +17,30 @@
 from google.cloud import speech_v1p1beta1 as speech
 
 
-def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:
+def transcribe_file_with_multilanguage_gcs(audio_uri: str) -> str:
     """Transcribe a remote audio file with multi-language recognition
-
     Args:
-        gcs_uri: The Google Cloud Storage path to an audio file.
-
+        audio_uri (str): The Google Cloud Storage path to an audio file.
+            E.g., gs://[BUCKET]/[FILE]
     Returns:
-        The generated transcript from the audio file provided.
+        str: The generated transcript from the audio file provided.
     """
 
     client = speech.SpeechClient()
 
-    first_language = "ja-JP"
-    alternate_languages = ["es-ES", "en-US"]
+    first_language = "es-ES"
+    alternate_languages = ["en-US", "fr-FR"]
 
     # Configure request to enable multiple languages
     recognition_config = speech.RecognitionConfig(
-        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
-        sample_rate_hertz=16000,
+        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
+        sample_rate_hertz=44100,
         language_code=first_language,
         alternative_language_codes=alternate_languages,
     )
 
     # Set the remote path for the audio file
-    audio = speech.RecognitionAudio(uri=gcs_uri)
+    audio = speech.RecognitionAudio(uri=audio_uri)
 
     # Use non-blocking call for getting file transcription
     response = client.long_running_recognize(
@@ -51,9 +50,9 @@ def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:
     transcript_builder = []
     for i, result in enumerate(response.results):
         alternative = result.alternatives[0]
-        transcript_builder.append("-" * 20)
+        transcript_builder.append("-" * 20 + "\n")
         transcript_builder.append(f"First alternative of result {i}: {alternative}")
-        transcript_builder.append(f"Transcript: {alternative.transcript}")
+        transcript_builder.append(f"Transcript: {alternative.transcript} \n")
 
     transcript = "".join(transcript_builder)
     print(transcript)
@@ -62,3 +61,8 @@ def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:
 
 
 # [END speech_transcribe_multilanguage_gcs_beta]
+
+if __name__ == "__main__":
+    transcribe_file_with_multilanguage_gcs(
+        "gs://cloud-samples-data/speech/multi_es.flac"
+    )
diff --git a/speech/snippets/transcribe_multilanguage_gcs_beta_test.py b/speech/snippets/transcribe_multilanguage_gcs_beta_test.py
@@ -0,0 +1,31 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from google.api_core.retry import Retry
+
+import pytest
+
+import transcribe_multilanguage_gcs_beta
+
+
+@Retry()
+def test_transcribe_file_with_multilanguage_gcs(capsys: pytest.CaptureFixture) -> None:
+    audio = "gs://cloud-samples-data/speech/multi_es.flac"
+    response = transcribe_multilanguage_gcs_beta.transcribe_file_with_multilanguage_gcs(
+        audio
+    )
+    out, err = capsys.readouterr()
+
+    assert response is not None
+    assert "estoy" in out