Skip to content

Commit

Permalink
Simplified speech samples (set-2) (GoogleCloudPlatform#12120)
Browse files Browse the repository at this point in the history
* Simplified next set of Samples

* Fix nits and added test

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
Thoughtseize1 and gcf-owl-bot[bot] authored Aug 14, 2024
1 parent 6d59d2d commit de4ad98
Show file tree
Hide file tree
Showing 9 changed files with 102 additions and 82 deletions.
11 changes: 0 additions & 11 deletions speech/snippets/transcribe_async_gcs_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@

import transcribe_async_gcs
import transcribe_diarization_gcs_beta
import transcribe_multilanguage_gcs_beta
import transcribe_word_level_confidence_gcs_beta

BUCKET = "cloud-samples-data"
GCS_AUDIO_PATH = "gs://" + BUCKET + "/speech/brooklyn_bridge.flac"
GCS_DIARIZATION_AUDIO_PATH = "gs://" + BUCKET + "/speech/commercial_mono.wav"
GCS_MUTLILANGUAGE_PATH = "gs://" + BUCKET + "/speech/Google_Gnome.wav"


@Retry()
Expand All @@ -40,15 +38,6 @@ def test_transcribe_diarization_gcs_beta() -> None:
assert is_completed


def test_transcribe_multilanguage_gcs_bets() -> None:
transcript = (
transcribe_multilanguage_gcs_beta.transcribe_file_with_multilanguage_gcs(
GCS_MUTLILANGUAGE_PATH
)
)
assert re.search("Transcript: OK Google", transcript)


def test_transcribe_word_level_confidence_gcs_beta() -> None:
transcript = transcribe_word_level_confidence_gcs_beta.transcribe_file_with_word_level_confidence(
GCS_AUDIO_PATH
Expand Down
32 changes: 17 additions & 15 deletions speech/snippets/transcribe_chirp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import argparse

# [START speech_transcribe_chirp]
import os

from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech

PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")


def transcribe_chirp(
project_id: str,
audio_file: str,
) -> cloud_speech.RecognizeResponse:
"""Transcribe an audio file using Chirp."""
"""Transcribes an audio file using the Chirp model of Google Cloud Speech-to-Text API.
Args:
audio_file (str): Path to the local audio file to be transcribed.
Example: "resources/audio.wav"
Returns:
cloud_speech.RecognizeResponse: The response from the Speech-to-Text API containing
the transcription results.
"""
# Instantiates a client
client = SpeechClient(
client_options=ClientOptions(
Expand All @@ -35,7 +43,7 @@ def transcribe_chirp(

# Reads a file as bytes
with open(audio_file, "rb") as f:
content = f.read()
audio_content = f.read()

config = cloud_speech.RecognitionConfig(
auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
Expand All @@ -44,9 +52,9 @@ def transcribe_chirp(
)

request = cloud_speech.RecognizeRequest(
recognizer=f"projects/{project_id}/locations/us-central1/recognizers/_",
recognizer=f"projects/{PROJECT_ID}/locations/us-central1/recognizers/_",
config=config,
content=content,
content=audio_content,
)

# Transcribes the audio into text
Expand All @@ -62,10 +70,4 @@ def transcribe_chirp(


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument("project_id", help="GCP Project ID")
parser.add_argument("audio_file", help="Audio file to stream")
args = parser.parse_args()
transcribe_chirp(args.project_id, args.audio_file)
transcribe_chirp("resources/audio.wav")
7 changes: 1 addition & 6 deletions speech/snippets/transcribe_chirp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,7 @@

@Retry()
def test_transcribe_chirp() -> None:
project_id = os.getenv("GOOGLE_CLOUD_PROJECT")

response = transcribe_chirp.transcribe_chirp(
project_id, os.path.join(_RESOURCES, "audio.wav")
)

response = transcribe_chirp.transcribe_chirp(os.path.join(_RESOURCES, "audio.wav"))
assert re.search(
r"how old is the Brooklyn Bridge",
response.results[0].alternatives[0].transcript,
Expand Down
17 changes: 8 additions & 9 deletions speech/snippets/transcribe_context_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,19 @@
from google.cloud import speech


def transcribe_context_classes(storage_uri: str) -> speech.RecognizeResponse:
"""Provides "hints" to the speech recognizer to
favor specific classes of words in the results.
def transcribe_context_classes(audio_uri: str) -> speech.RecognizeResponse:
"""Provides "hints" to the speech recognizer to favor
specific classes of words in the results.
Args:
storage_uri: The URI of the audio file to transcribe.
audio_uri: The URI of the audio file to transcribe.
E.g., gs://[BUCKET]/[FILE]
Returns:
The transcript of the audio file.
speech.RecognizeResponse: The response containing the transcription results.
"""
client = speech.SpeechClient()

# storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
audio = speech.RecognitionAudio(uri=storage_uri)
# audio_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav'
audio = speech.RecognitionAudio(uri=audio_uri)

# SpeechContext: to configure your speech_context see:
# https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext
Expand Down
22 changes: 13 additions & 9 deletions speech/snippets/transcribe_diarization_gcs_beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,24 @@
from google.cloud import speech


def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:
def transcribe_diarization_gcs_beta(audio_uri: str) -> bool:
"""Transcribe a remote audio file (stored in Google Cloud Storage) using speaker diarization.
Args:
gcs_uri: The Google Cloud Storage path to an audio file.
audio_uri (str): The Google Cloud Storage path to an audio file.
E.g., gs://[BUCKET]/[FILE]
Returns:
True if the operation successfully completed, False otherwise.
"""

client = speech.SpeechClient()

# Enable speaker diarization with the speaker count fixed at two
speaker_diarization_config = speech.SpeakerDiarizationConfig(
enable_speaker_diarization=True,
min_speaker_count=2,
max_speaker_count=2,
min_speaker_count=2, # Set minimum number of speakers
max_speaker_count=2, # Adjust max speakers based on expected number of speakers
)

# Configure request to enable Speaker diarization
# Configure recognition with enhanced audio settings
recognition_config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
language_code="en-US",
Expand All @@ -45,7 +44,7 @@ def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:

# Set the remote path for the audio file
audio = speech.RecognitionAudio(
uri=gcs_uri,
uri=audio_uri,
)

# Use non-blocking call for getting file transcription
Expand All @@ -67,3 +66,8 @@ def transcribe_diarization_gcs_beta(gcs_uri: str) -> bool:


# [END speech_transcribe_diarization_gcs_beta]

if __name__ == "__main__":
    # Script entry point: run the diarization sample on the sample
    # recording stored in the cloud-samples-data bucket.
    sample_uri = "gs://cloud-samples-data/speech/commercial_mono.wav"
    transcribe_diarization_gcs_beta(audio_uri=sample_uri)
32 changes: 13 additions & 19 deletions speech/snippets/transcribe_enhanced_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,27 +14,28 @@

"""Google Cloud Speech API sample that demonstrates enhanced models
and recognition metadata.
Example usage:
python transcribe_enhanced_model.py resources/commercial_mono.wav
"""
# [START speech_transcribe_enhanced_model]

import argparse

from google.cloud import speech


def transcribe_file_with_enhanced_model(path: str) -> speech.RecognizeResponse:
"""Transcribe the given audio file using an enhanced model."""
def transcribe_file_with_enhanced_model(audio_file: str) -> speech.RecognizeResponse:
"""Transcribe the given audio file using an enhanced model.
Args:
audio_file (str): Path to the local audio file to be transcribed.
Example: "resources/commercial_mono.wav"
Returns:
speech.RecognizeResponse: The response containing the transcription results.
"""

client = speech.SpeechClient()

# path = 'resources/commercial_mono.wav'
with open(path, "rb") as audio_file:
content = audio_file.read()
# audio_file = 'resources/commercial_mono.wav'
with open(audio_file, "rb") as f:
audio_content = f.read()

audio = speech.RecognitionAudio(content=content)
audio = speech.RecognitionAudio(content=audio_content)
config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=8000,
Expand All @@ -57,11 +58,4 @@ def transcribe_file_with_enhanced_model(path: str) -> speech.RecognizeResponse:


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
parser.add_argument("path", help="File to stream to the API")

args = parser.parse_args()

transcribe_file_with_enhanced_model(args.path)
transcribe_file_with_enhanced_model("resources/commercial_mono.wav")
4 changes: 3 additions & 1 deletion speech/snippets/transcribe_enhanced_model_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,7 @@ def test_transcribe_file_with_enhanced_model(capsys: pytest.CaptureFixture) -> N
)
out, _ = capsys.readouterr()

assert "Chrome" in out
assert result is not None
assert "Chrome" in out
assert "First alternative" in out
assert "result 7" in out
28 changes: 16 additions & 12 deletions speech/snippets/transcribe_multilanguage_gcs_beta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,31 +17,30 @@
from google.cloud import speech_v1p1beta1 as speech


def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:
def transcribe_file_with_multilanguage_gcs(audio_uri: str) -> str:
"""Transcribe a remote audio file with multi-language recognition
Args:
gcs_uri: The Google Cloud Storage path to an audio file.
audio_uri (str): The Google Cloud Storage path to an audio file.
E.g., gs://[BUCKET]/[FILE]
Returns:
The generated transcript from the audio file provided.
str: The generated transcript from the audio file provided.
"""

client = speech.SpeechClient()

first_language = "ja-JP"
alternate_languages = ["es-ES", "en-US"]
first_language = "es-ES"
alternate_languages = ["en-US", "fr-FR"]

# Configure request to enable multiple languages
recognition_config = speech.RecognitionConfig(
encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=16000,
encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
sample_rate_hertz=44100,
language_code=first_language,
alternative_language_codes=alternate_languages,
)

# Set the remote path for the audio file
audio = speech.RecognitionAudio(uri=gcs_uri)
audio = speech.RecognitionAudio(uri=audio_uri)

# Use non-blocking call for getting file transcription
response = client.long_running_recognize(
Expand All @@ -51,9 +50,9 @@ def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:
transcript_builder = []
for i, result in enumerate(response.results):
alternative = result.alternatives[0]
transcript_builder.append("-" * 20)
transcript_builder.append("-" * 20 + "\n")
transcript_builder.append(f"First alternative of result {i}: {alternative}")
transcript_builder.append(f"Transcript: {alternative.transcript}")
transcript_builder.append(f"Transcript: {alternative.transcript} \n")

transcript = "".join(transcript_builder)
print(transcript)
Expand All @@ -62,3 +61,8 @@ def transcribe_file_with_multilanguage_gcs(gcs_uri: str) -> str:


# [END speech_transcribe_multilanguage_gcs_beta]

if __name__ == "__main__":
    # Script entry point: run the multi-language sample on the FLAC
    # recording stored in the cloud-samples-data bucket.
    sample_uri = "gs://cloud-samples-data/speech/multi_es.flac"
    transcribe_file_with_multilanguage_gcs(sample_uri)
31 changes: 31 additions & 0 deletions speech/snippets/transcribe_multilanguage_gcs_beta_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from google.api_core.retry import Retry

import pytest

import transcribe_multilanguage_gcs_beta


@Retry()
def test_transcribe_file_with_multilanguage_gcs(capsys: pytest.CaptureFixture) -> None:
    """Run the multi-language sample and check its result and printed output.

    The sample is called with the ``multi_es.flac`` recording; the test
    passes when a value is returned and the captured stdout contains the
    word "estoy".
    """
    transcript = transcribe_multilanguage_gcs_beta.transcribe_file_with_multilanguage_gcs(
        "gs://cloud-samples-data/speech/multi_es.flac"
    )
    # Only stdout matters here; stderr is not inspected.
    printed = capsys.readouterr().out

    assert transcript is not None
    assert "estoy" in printed

0 comments on commit de4ad98

Please sign in to comment.