Changes from all commits
44 commits
9dc7be2
Add audio support
elronbandel Apr 1, 2025
5b1ca57
Merge remote-tracking branch 'origin/main' into add-audio-support
elronbandel Jul 21, 2025
b5c64c3
Update speech benchmark
elronbandel Jul 21, 2025
628d8ff
Change benchmark dataset to esb and fix imports
elronbandel Jul 22, 2025
f416e80
Some updates
elronbandel Jul 24, 2025
1e8fd9e
Comment out subset in benchmark
elronbandel Jul 24, 2025
029019e
Add fluers
elronbandel Jul 24, 2025
7e54146
Add speech translation benchmark
elronbandel Jul 27, 2025
c7576b4
Make sure not to aggregate data in evaluation pipeline
elronbandel Jul 27, 2025
fafe7d2
Merge branch 'main' into add-audio-support
elronbandel Aug 4, 2025
1efb526
Update catalog
elronbandel Aug 4, 2025
d78b610
Update imports
elronbandel Aug 4, 2025
4ce9610
Fix test
elronbandel Aug 4, 2025
bb0be6f
Fix formatting minds 14
elronbandel Aug 4, 2025
98a7ef7
Remove normalized wer
elronbandel Aug 4, 2025
27846f6
Fix post processing pipeline for cases of eager execution
elronbandel Aug 4, 2025
92dedff
Change speech format to old open ai style
elronbandel Aug 6, 2025
3f3777f
Merge branch 'main' into add-audio-support
elronbandel Aug 6, 2025
bd17435
Fix evaluation postprocessing pipeline
elronbandel Aug 6, 2025
3db6a38
Update tests dependencies
elronbandel Aug 6, 2025
efb00d1
Install audio requirements
elronbandel Aug 6, 2025
b80166e
Another try
elronbandel Aug 6, 2025
2e2e096
Update tests dependencies
elronbandel Aug 6, 2025
7412701
Update tests dependencies
elronbandel Aug 6, 2025
ed5eaef
Add torch audio to inference tests dependencies
elronbandel Aug 6, 2025
000f2f2
Merge remote-tracking branch 'origin/main' into add-audio-support
elronbandel Aug 7, 2025
2be0307
Add torchaudio depndency
elronbandel Aug 7, 2025
1f1c536
Update dependencies
elronbandel Aug 7, 2025
1d7cb81
Add reqs
elronbandel Aug 7, 2025
3ea3155
Try specific revision to reduce memory used
elronbandel Aug 10, 2025
39d8fdf
Merge branch 'main' into add-audio-support
elronbandel Aug 13, 2025
95f6f75
Allow skipping tests
elronbandel Aug 13, 2025
c2f9962
Added support for speech recognition and speech translation (#1919)
aharonsatt Aug 14, 2025
e10922d
updates
Aug 14, 2025
4d50103
make covost tests run only when files exist
elronbandel Aug 19, 2025
3ff1fa6
Add directory check before testing card
elronbandel Aug 19, 2025
a3a3cec
Use parquet files for speed
elronbandel Aug 20, 2025
f2655cd
add missing splits for completeness
elronbandel Aug 20, 2025
4db3767
Merge branch 'main' into add-audio-support
elronbandel Aug 20, 2025
d6e0f00
fix
elronbandel Aug 24, 2025
f0d3396
Merge branch 'add-audio-support' of https://github.com/IBM/unitxt int…
elronbandel Aug 24, 2025
7f7399e
Merge branch 'main' into add-audio-support
elronbandel Aug 24, 2025
3d45b48
change to small ASR datasets
Aug 31, 2025
964bc37
change to small ASR datasets
Aug 31, 2025
1 change: 1 addition & 0 deletions .github/workflows/examples_tests.yml
@@ -30,6 +30,7 @@ jobs:
      WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
      WML_APIKEY: ${{ secrets.WML_APIKEY }}
      GENAI_KEY: ${{ secrets.GENAI_KEY }}
      SKIP_HEAVY_LOCAL: "True"

    steps:
      - uses: actions/checkout@v4
2 changes: 2 additions & 0 deletions .github/workflows/inference_tests.yml
@@ -32,6 +32,8 @@ jobs:
      WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} # Similar to WML_PROJECT_ID
      WX_API_KEY: ${{ secrets.WML_APIKEY }} # Similar to WML_APIKEY
      GENAI_KEY: ${{ secrets.GENAI_KEY }}
      SKIP_HEAVY_LOCAL: "True"


    steps:
      - uses: actions/checkout@v4
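Both workflows export SKIP_HEAVY_LOCAL so the example scripts below can exit before loading a local model in CI. Note that the scripts check the flag with os.environ.get("SKIP_HEAVY_LOCAL", False), which returns the raw string, so any non-empty value (even "False") counts as "skip". A minimal sketch of a stricter check, should one ever be wanted; the parse_bool_env helper is illustrative and not part of this PR:

import os

def parse_bool_env(name: str, default: bool = False) -> bool:
    # illustrative helper: accept only common true-like spellings
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

if parse_bool_env("SKIP_HEAVY_LOCAL"):
    raise SystemExit(0)  # same effect as the scripts' exit()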
67 changes: 67 additions & 0 deletions examples/evaluate_speech_recognition.py
@@ -0,0 +1,67 @@
# This Python script shows an example of running speech recognition evaluation for Granite Speech
# using the Hugging Face ESB datasets and the CommonVoice datasets.

import os

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    CrossProviderInferenceEngine,
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

USE_RITS = False  # whether to use the RITS service
USE_WML = False  # whether to use the WML service

test_dataset = load_dataset(
    # select (uncomment) only one of the following cards (datasets);
    # to evaluate a benchmark with multiple cards, see evaluate_speech_recognition_benchmark.py in this directory
    card="cards.esb.ami",
    # card="cards.esb.voxpopuli",
    # card="cards.esb.librispeech",
    # card="cards.esb.spgispeech",
    # card="cards.esb.earnings22",
    # card="cards.esb.tedlium",
    # card="cards.commonvoice.en",
    # card="cards.commonvoice.de",
    # card="cards.commonvoice.fr",
    # card="cards.commonvoice.es",
    # card="cards.commonvoice.pt",
    split="test",
    format="formats.chat_api",
    max_test_instances=5,  # to run only a limited part of the test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if os.environ.get("SKIP_HEAVY_LOCAL", False):
    exit()

if not USE_RITS and not USE_WML:
    # run the model locally; it needs a GPU to run properly
    model = HFGraniteSpeechInferenceEngine(
        model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
        revision="granite-speech-3.3.2-2b",
        max_new_tokens=200,
    )
if USE_RITS:
    # use the RITS remote service for inference
    model = CrossProviderInferenceEngine(
        model="granite-speech-3-3-8b",  # in RITS only the 8b version of Granite Speech is available
        provider="rits",
        # provider_specific_args={"rits": {"max_new_tokens": 200}},
        max_new_tokens=200,
    )
if USE_WML:
    # use the WML remote service for inference
    # code to be completed
    model = None


predictions = model(test_dataset)
results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
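The USE_WML branch above is an acknowledged stub ("code to be completed"). As a hedged sketch only: CrossProviderInferenceEngine also accepts provider="watsonx", so the branch might eventually look like the following, where the model id and the availability of Granite Speech on watsonx are assumptions, not something this PR establishes:

if USE_WML:
    # sketch under assumptions: "watsonx" is a CrossProviderInferenceEngine provider,
    # but the model id below is hypothetical and may not actually be served there
    model = CrossProviderInferenceEngine(
        model="granite-speech-3-3-8b",  # hypothetical watsonx model id
        provider="watsonx",  # credentials come from the environment (the workflow exports WX_API_KEY / WX_PROJECT_ID)
        max_new_tokens=200,
    )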
45 changes: 45 additions & 0 deletions examples/evaluate_speech_recognition_benchmark.py
@@ -0,0 +1,45 @@
# This Python script shows an example of running speech recognition benchmark evaluation for Granite Speech
# using the Hugging Face ESB datasets (English) and the multilingual CommonVoice datasets.

# To run on a single test set use subset=... below; the list of subsets is:
# voxpopuli, ami, librispeech, spgispeech, tedlium, earnings22,
# commonvoice_en, commonvoice_de, commonvoice_es, commonvoice_fr, commonvoice_pt

import os

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

dataset = load_dataset(
    "benchmarks.speech_recognition",
    max_samples_per_subset=5,  # limits each subset to 5 instances; comment out to use the entire test set
    # subset="ami",  # to run only a single dataset
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
    split="test",
)


if os.environ.get("SKIP_HEAVY_LOCAL", False):
    exit()

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-2b",  # two options for Granite Speech 3.3: 2b and 8b
    revision="granite-speech-3.3.2-2b",
    max_new_tokens=200,
)

predictions = model(dataset)
results = evaluate(
    predictions=predictions, data=dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)

print("Subsets scores:")
print(results.subsets_scores.summary)
38 changes: 38 additions & 0 deletions examples/evaluate_speech_translation_benchmark.py
@@ -0,0 +1,38 @@
# This Python script shows an example of running speech translation benchmark evaluation for Granite Speech
# using the Fleurs and Covost2 datasets.

# To run on a single test set use subset=... below; the list of subsets is:
# fleurs_en_de, fleurs_en_es, fleurs_en_fr, fleurs_en_it, fleurs_en_ja, fleurs_en_pt,
# covost2_en_de, covost2_en_ja, covost2_de_en, covost2_es_en, covost2_fr_en, covost2_pt_en

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

dataset = load_dataset(
    "benchmarks.speech_translation",
    # max_samples_per_subset=100,  # while this is commented out, the entire test set is used
    # subset="fleurs_en_fr",  # to run only a single test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
    split="test",
)

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(dataset)
results = evaluate(
    predictions=predictions, data=dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)

print("Subsets scores:")
print(results.subsets_scores.summary)
58 changes: 58 additions & 0 deletions examples/evaluate_speech_translation_covost2.py
@@ -0,0 +1,58 @@
# This Python script shows an example of running speech translation evaluation for Granite Speech.

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

debug = True  # True for extra printing; set to False when commenting out max_test_instances below
max_test_instances = 8

# the available languages for the covost2 dataset are:
# translation from English to a target language:
#   de  German
#   ja  Japanese
# translation from a source language to English:
#   de  German
#   es  Spanish
#   fr  French
#   pt  Portuguese
test_dataset = load_dataset(  # select (uncomment) one of the test sets below
    card="cards.covost2.from_en.en_de",
    # card="cards.covost2.from_en.en_ja",
    # card="cards.covost2.to_en.de_en",
    # card="cards.covost2.to_en.es_en",
    # card="cards.covost2.to_en.fr_en",
    # card="cards.covost2.to_en.pt_en",
    split="test",
    format="formats.chat_api",
    max_test_instances=max_test_instances,  # comment out to run the entire test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if debug:  # print translation reference texts for debugging and inspection
    print(">>>>>>>>>>>>>> first test references >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>> references {idx}: ", test_dataset["references"][idx])

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(test_dataset)

if debug:  # print model predictions for debugging and inspection
    print(">>>>>>>>>>>>>> first predictions >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>>>>>>> {idx}: ", predictions[idx])

results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
58 changes: 58 additions & 0 deletions examples/evaluate_speech_translation_fleurs.py
@@ -0,0 +1,58 @@
# This Python script shows an example of running speech translation evaluation for Granite Speech.

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

debug = False  # True for extra printing; set to False when commenting out max_test_instances below
max_test_instances = 20

# the available cards for the fleurs dataset, reflecting the target language, are:
#   de_de        German
#   es_419       Spanish, South America
#   fr_fr        French
#   it_it        Italian
#   ja_jp        Japanese
#   pt_br        Portuguese, Brazil
#   cmn_hans_cn  Chinese, Mandarin
test_dataset = load_dataset(  # select (uncomment) one of the test sets below
    # card="cards.fleurs.en_us.de_de",
    # card="cards.fleurs.en_us.es_419",
    # card="cards.fleurs.en_us.fr_fr",
    # card="cards.fleurs.en_us.it_it",
    # card="cards.fleurs.en_us.pt_br",
    card="cards.fleurs.en_us.ja_jp",
    # card="cards.fleurs.en_us.cmn_hans_cn",
    split="test",
    format="formats.chat_api",
    # max_test_instances=max_test_instances,  # uncomment to run only part of the test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if debug:  # print translation reference texts for debugging and inspection
    print(">>>>>>>>>>>>>> test references >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>> references {idx}: ", test_dataset["references"][idx])

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(test_dataset)

if debug:  # print model predictions for debugging and inspection
    print(">>>>>>>>>>>>>> model predictions >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>>>>>>> {idx}: ", predictions[idx])

results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
38 changes: 38 additions & 0 deletions prepare/benchmarks/speech_recognition_english.py
@@ -0,0 +1,38 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
    subsets={
        "voxpopuli": DatasetRecipe(
            card="cards.esb.voxpopuli",
            format="formats.chat_api",
        ),
        "ami": DatasetRecipe(
            card="cards.esb.ami",
            format="formats.chat_api",
        ),
        "librispeech": DatasetRecipe(
            card="cards.esb.librispeech",
            format="formats.chat_api",
        ),
        "spgispeech": DatasetRecipe(
            card="cards.esb.spgispeech",
            format="formats.chat_api",
        ),
        "tedlium": DatasetRecipe(
            card="cards.esb.tedlium",
            format="formats.chat_api",
        ),
        "earnings22": DatasetRecipe(
            card="cards.esb.earnings22",
            format="formats.chat_api",
        ),
        "commonvoice": DatasetRecipe(
            card="cards.esb.commonvoice",
            format="formats.chat_api",
        ),
    },
)

add_to_catalog(benchmark, "benchmarks.speech_recognition", overwrite=True)
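Once this script has registered the benchmark in the catalog, it is consumed exactly as in examples/evaluate_speech_recognition_benchmark.py above; a minimal usage sketch (the multilingual variant below loads the same way under "benchmarks.speech_recognition_multilingual"):

from unitxt import load_dataset

dataset = load_dataset(
    "benchmarks.speech_recognition",
    max_samples_per_subset=5,  # optional cap on instances per subset
    # subset="ami",  # optionally restrict the run to a single subset
    split="test",
)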
26 changes: 26 additions & 0 deletions prepare/benchmarks/speech_recognition_multilingual.py
@@ -0,0 +1,26 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
    subsets={
        "commonvoice_de": DatasetRecipe(
            card="cards.commonvoice.de",
            format="formats.chat_api",
        ),
        "commonvoice_es": DatasetRecipe(
            card="cards.commonvoice.es",
            format="formats.chat_api",
        ),
        "commonvoice_fr": DatasetRecipe(
            card="cards.commonvoice.fr",
            format="formats.chat_api",
        ),
        "commonvoice_pt": DatasetRecipe(
            card="cards.commonvoice.pt",
            format="formats.chat_api",
        ),
    },
)

add_to_catalog(benchmark, "benchmarks.speech_recognition_multilingual", overwrite=True)