Changes from all commits
44 commits
9dc7be2
Add audio support
elronbandel Apr 1, 2025
5b1ca57
Merge remote-tracking branch 'origin/main' into add-audio-support
elronbandel Jul 21, 2025
b5c64c3
Update speech benchmark
elronbandel Jul 21, 2025
628d8ff
Change benchmark dataset to esb and fix imports
elronbandel Jul 22, 2025
f416e80
Some updates
elronbandel Jul 24, 2025
1e8fd9e
Comment out subset in benchmark
elronbandel Jul 24, 2025
029019e
Add fluers
elronbandel Jul 24, 2025
7e54146
Add speech translation benchmark
elronbandel Jul 27, 2025
c7576b4
Make sure not to aggregate data in evaluation pipeline
elronbandel Jul 27, 2025
fafe7d2
Merge branch 'main' into add-audio-support
elronbandel Aug 4, 2025
1efb526
Update catalog
elronbandel Aug 4, 2025
d78b610
Update imports
elronbandel Aug 4, 2025
4ce9610
Fix test
elronbandel Aug 4, 2025
bb0be6f
Fix formatting minds 14
elronbandel Aug 4, 2025
98a7ef7
Remove normalized wer
elronbandel Aug 4, 2025
27846f6
Fix post processing pipeline for cases of eager execution
elronbandel Aug 4, 2025
92dedff
Change speech format to old open ai style
elronbandel Aug 6, 2025
3f3777f
Merge branch 'main' into add-audio-support
elronbandel Aug 6, 2025
bd17435
Fix evaluation postprocessing pipeline
elronbandel Aug 6, 2025
3db6a38
Update tests dependencies
elronbandel Aug 6, 2025
efb00d1
Install audio requirements
elronbandel Aug 6, 2025
b80166e
Another try
elronbandel Aug 6, 2025
2e2e096
Update tests dependencies
elronbandel Aug 6, 2025
7412701
Update tests dependencies
elronbandel Aug 6, 2025
ed5eaef
Add torch audio to inference tests dependencies
elronbandel Aug 6, 2025
000f2f2
Merge remote-tracking branch 'origin/main' into add-audio-support
elronbandel Aug 7, 2025
2be0307
Add torchaudio depndency
elronbandel Aug 7, 2025
1f1c536
Update dependencies
elronbandel Aug 7, 2025
1d7cb81
Add reqs
elronbandel Aug 7, 2025
3ea3155
Try specific revision to reduce memory used
elronbandel Aug 10, 2025
39d8fdf
Merge branch 'main' into add-audio-support
elronbandel Aug 13, 2025
95f6f75
Allow skipping tests
elronbandel Aug 13, 2025
c2f9962
Added support for speech recognition and speech translation (#1919)
aharonsatt Aug 14, 2025
e10922d
updates
Aug 14, 2025
4d50103
make covost tests run only when files exist
elronbandel Aug 19, 2025
3ff1fa6
Add directory check before testing card
elronbandel Aug 19, 2025
a3a3cec
Use parquet files for speed
elronbandel Aug 20, 2025
f2655cd
add missing splits for completeness
elronbandel Aug 20, 2025
4db3767
Merge branch 'main' into add-audio-support
elronbandel Aug 20, 2025
d6e0f00
fix
elronbandel Aug 24, 2025
f0d3396
Merge branch 'add-audio-support' of https://github.com/IBM/unitxt int…
elronbandel Aug 24, 2025
7f7399e
Merge branch 'main' into add-audio-support
elronbandel Aug 24, 2025
3d45b48
change to small ASR datasets
Aug 31, 2025
964bc37
change to small ASR datasets
Aug 31, 2025
1 change: 1 addition & 0 deletions .github/workflows/examples_tests.yml
@@ -30,6 +30,7 @@ jobs:
      WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }}
      WML_APIKEY: ${{ secrets.WML_APIKEY }}
      GENAI_KEY: ${{ secrets.GENAI_KEY }}
      SKIP_HEAVY_LOCAL: "True"

    steps:
      - uses: actions/checkout@v4
2 changes: 2 additions & 0 deletions .github/workflows/inference_tests.yml
@@ -32,6 +32,8 @@ jobs:
      WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} # Similar to WML_PROJECT_ID
      WX_API_KEY: ${{ secrets.WML_APIKEY }} # Similar to WML_APIKEY
      GENAI_KEY: ${{ secrets.GENAI_KEY }}
      SKIP_HEAVY_LOCAL: "True"


    steps:
      - uses: actions/checkout@v4
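Both workflows export SKIP_HEAVY_LOCAL so the example scripts below can exit before loading a local model in CI. Note that the scripts check the flag with os.environ.get("SKIP_HEAVY_LOCAL", False), which returns the raw string, so any non-empty value (even "False") counts as "skip". A minimal sketch of a stricter check, should one ever be wanted; the parse_bool_env helper is illustrative and not part of this PR:

import os

def parse_bool_env(name: str, default: bool = False) -> bool:
    # illustrative helper: accept only common true-like spellings
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

if parse_bool_env("SKIP_HEAVY_LOCAL"):
    raise SystemExit(0)  # same effect as the scripts' exit()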
67 changes: 67 additions & 0 deletions examples/evaluate_speech_recognition.py
@@ -0,0 +1,67 @@
# This Python script shows an example of running speech recognition evaluation for Granite Speech
# using the Hugging Face ESB datasets and the CommonVoice datasets.

import os

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    CrossProviderInferenceEngine,
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

USE_RITS = False  # whether to use the RITS service
USE_WML = False  # whether to use the WML service

test_dataset = load_dataset(
    # select (uncomment) only one of the following cards (datasets);
    # to evaluate a benchmark with multiple cards, see evaluate_speech_recognition_benchmark.py in this directory
    card="cards.esb.ami",
    # card="cards.esb.voxpopuli",
    # card="cards.esb.librispeech",
    # card="cards.esb.spgispeech",
    # card="cards.esb.earnings22",
    # card="cards.esb.tedlium",
    # card="cards.commonvoice.en",
    # card="cards.commonvoice.de",
    # card="cards.commonvoice.fr",
    # card="cards.commonvoice.es",
    # card="cards.commonvoice.pt",
    split="test",
    format="formats.chat_api",
    max_test_instances=5,  # to run only a limited part of the test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if os.environ.get("SKIP_HEAVY_LOCAL", False):
    exit()

if not USE_RITS and not USE_WML:
    # run the model locally; it needs a GPU to run properly
    model = HFGraniteSpeechInferenceEngine(
        model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
        revision="granite-speech-3.3.2-2b",
        max_new_tokens=200,
    )
if USE_RITS:
    # use the RITS remote service for inference
    model = CrossProviderInferenceEngine(
        model="granite-speech-3-3-8b",  # in RITS only the 8b version of Granite Speech is available
        provider="rits",
        # provider_specific_args={"rits": {"max_new_tokens": 200}},
        max_new_tokens=200,
    )
if USE_WML:
    # use the WML remote service for inference
    # code to be completed
    model = None


predictions = model(test_dataset)
results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
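The USE_WML branch above is an acknowledged stub ("code to be completed"). As a hedged sketch only: CrossProviderInferenceEngine also accepts provider="watsonx", so the branch might eventually look like the following, where the model id and the availability of Granite Speech on watsonx are assumptions, not something this PR establishes:

if USE_WML:
    # sketch under assumptions: "watsonx" is a CrossProviderInferenceEngine provider,
    # but the model id below is hypothetical and may not actually be served there
    model = CrossProviderInferenceEngine(
        model="granite-speech-3-3-8b",  # hypothetical watsonx model id
        provider="watsonx",  # credentials come from the environment (the workflow exports WX_API_KEY / WX_PROJECT_ID)
        max_new_tokens=200,
    )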
45 changes: 45 additions & 0 deletions examples/evaluate_speech_recognition_benchmark.py
@@ -0,0 +1,45 @@
# This Python script shows an example of running speech recognition benchmark evaluation for Granite Speech
# using the Hugging Face ESB datasets (English) and the multilingual CommonVoice datasets.

# To run on a single test set use subset=... below; the list of subsets is:
# voxpopuli, ami, librispeech, spgispeech, tedlium, earnings22,
# commonvoice_en, commonvoice_de, commonvoice_es, commonvoice_fr, commonvoice_pt

import os

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

dataset = load_dataset(
    "benchmarks.speech_recognition",
    max_samples_per_subset=5,  # limits each subset to 5 instances; comment out to use the entire test set
    # subset="ami",  # to run only a single dataset
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
    split="test",
)


if os.environ.get("SKIP_HEAVY_LOCAL", False):
    exit()

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-2b",  # two options for Granite Speech 3.3: 2b and 8b
    revision="granite-speech-3.3.2-2b",
    max_new_tokens=200,
)

predictions = model(dataset)
results = evaluate(
    predictions=predictions, data=dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)

print("Subsets scores:")
print(results.subsets_scores.summary)
38 changes: 38 additions & 0 deletions examples/evaluate_speech_translation_benchmark.py
@@ -0,0 +1,38 @@
# This Python script shows an example of running speech translation benchmark evaluation for Granite Speech
# using the Fleurs and Covost2 datasets.

# To run on a single test set use subset=... below; the list of subsets is:
# fleurs_en_de, fleurs_en_es, fleurs_en_fr, fleurs_en_it, fleurs_en_ja, fleurs_en_pt,
# covost2_en_de, covost2_en_ja, covost2_de_en, covost2_es_en, covost2_fr_en, covost2_pt_en

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

dataset = load_dataset(
    "benchmarks.speech_translation",
    # max_samples_per_subset=100,  # while this is commented out, the entire test set is used
    # subset="fleurs_en_fr",  # to run only a single test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
    split="test",
)

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(dataset)
results = evaluate(
    predictions=predictions, data=dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)

print("Subsets scores:")
print(results.subsets_scores.summary)
58 changes: 58 additions & 0 deletions examples/evaluate_speech_translation_covost2.py
@@ -0,0 +1,58 @@
# This Python script shows an example of running speech translation evaluation for Granite Speech.

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

debug = True  # True for extra printing; set to False when commenting out max_test_instances below
max_test_instances = 8

# the available languages for the covost2 dataset are:
# translation from English to a target language:
#   de  German
#   ja  Japanese
# translation from a source language to English:
#   de  German
#   es  Spanish
#   fr  French
#   pt  Portuguese
test_dataset = load_dataset(  # select (uncomment) one of the test sets below
    card="cards.covost2.from_en.en_de",
    # card="cards.covost2.from_en.en_ja",
    # card="cards.covost2.to_en.de_en",
    # card="cards.covost2.to_en.es_en",
    # card="cards.covost2.to_en.fr_en",
    # card="cards.covost2.to_en.pt_en",
    split="test",
    format="formats.chat_api",
    max_test_instances=max_test_instances,  # comment out to run the entire test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if debug:  # print translation reference texts for debugging and inspection
    print(">>>>>>>>>>>>>> first test references >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>> references {idx}: ", test_dataset["references"][idx])

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(test_dataset)

if debug:  # print model predictions for debugging and inspection
    print(">>>>>>>>>>>>>> first predictions >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>>>>>>> {idx}: ", predictions[idx])

results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
58 changes: 58 additions & 0 deletions examples/evaluate_speech_translation_fleurs.py
@@ -0,0 +1,58 @@
# This Python script shows an example of running speech translation evaluation for Granite Speech.

from unitxt import evaluate, load_dataset
from unitxt.inference import (
    HFGraniteSpeechInferenceEngine,
)
from unitxt.system_prompts import TextualSystemPrompt

debug = False  # True for extra printing; set to False when commenting out max_test_instances below
max_test_instances = 20

# the available cards for the fleurs dataset, reflecting the target language, are:
#   de_de        German
#   es_419       Spanish, South America
#   fr_fr        French
#   it_it        Italian
#   ja_jp        Japanese
#   pt_br        Portuguese, Brazil
#   cmn_hans_cn  Chinese, Mandarin
test_dataset = load_dataset(  # select (uncomment) one of the test sets below
    # card="cards.fleurs.en_us.de_de",
    # card="cards.fleurs.en_us.es_419",
    # card="cards.fleurs.en_us.fr_fr",
    # card="cards.fleurs.en_us.it_it",
    # card="cards.fleurs.en_us.pt_br",
    card="cards.fleurs.en_us.ja_jp",
    # card="cards.fleurs.en_us.cmn_hans_cn",
    split="test",
    format="formats.chat_api",
    # max_test_instances=max_test_instances,  # uncomment to run only part of the test set
    system_prompt=TextualSystemPrompt(
        text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant"
    ),
)

if debug:  # print translation reference texts for debugging and inspection
    print(">>>>>>>>>>>>>> test references >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>> references {idx}: ", test_dataset["references"][idx])

model = HFGraniteSpeechInferenceEngine(
    model_name="ibm-granite/granite-speech-3.3-8b",  # two options for Granite Speech 3.3: 2b and 8b
    max_new_tokens=200,
)

predictions = model(test_dataset)

if debug:  # print model predictions for debugging and inspection
    print(">>>>>>>>>>>>>> model predictions >>>>>>>>>>>>")
    for idx in range(max_test_instances):
        print(f">>>>>>>>>>> {idx}: ", predictions[idx])

results = evaluate(
    predictions=predictions, data=test_dataset, calc_confidence_intervals=False
)

print("Global scores:")
print(results.global_scores.summary)
38 changes: 38 additions & 0 deletions prepare/benchmarks/speech_recognition_english.py
@@ -0,0 +1,38 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
    subsets={
        "voxpopuli": DatasetRecipe(
            card="cards.esb.voxpopuli",
            format="formats.chat_api",
        ),
        "ami": DatasetRecipe(
            card="cards.esb.ami",
            format="formats.chat_api",
        ),
        "librispeech": DatasetRecipe(
            card="cards.esb.librispeech",
            format="formats.chat_api",
        ),
        "spgispeech": DatasetRecipe(
            card="cards.esb.spgispeech",
            format="formats.chat_api",
        ),
        "tedlium": DatasetRecipe(
            card="cards.esb.tedlium",
            format="formats.chat_api",
        ),
        "earnings22": DatasetRecipe(
            card="cards.esb.earnings22",
            format="formats.chat_api",
        ),
        "commonvoice": DatasetRecipe(
            card="cards.esb.commonvoice",
            format="formats.chat_api",
        ),
    },
)

add_to_catalog(benchmark, "benchmarks.speech_recognition", overwrite=True)
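Once this script has registered the benchmark in the catalog, it is consumed exactly as in examples/evaluate_speech_recognition_benchmark.py above; a minimal usage sketch (the multilingual variant below loads the same way under "benchmarks.speech_recognition_multilingual"):

from unitxt import load_dataset

dataset = load_dataset(
    "benchmarks.speech_recognition",
    max_samples_per_subset=5,  # optional cap on instances per subset
    # subset="ami",  # optionally restrict the run to a single subset
    split="test",
)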
26 changes: 26 additions & 0 deletions prepare/benchmarks/speech_recognition_multilingual.py
@@ -0,0 +1,26 @@
from unitxt.benchmark import Benchmark
from unitxt.catalog import add_to_catalog
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
    subsets={
        "commonvoice_de": DatasetRecipe(
            card="cards.commonvoice.de",
            format="formats.chat_api",
        ),
        "commonvoice_es": DatasetRecipe(
            card="cards.commonvoice.es",
            format="formats.chat_api",
        ),
        "commonvoice_fr": DatasetRecipe(
            card="cards.commonvoice.fr",
            format="formats.chat_api",
        ),
        "commonvoice_pt": DatasetRecipe(
            card="cards.commonvoice.pt",
            format="formats.chat_api",
        ),
    },
)

add_to_catalog(benchmark, "benchmarks.speech_recognition_multilingual", overwrite=True)