CV text - Local voice #38


Draft · wants to merge 43 commits into main from 16-finetune-own-data
43 commits
7bf950c
[WIP] add UI config for finetuning in notebook
Kostis-S-Z Jan 19, 2025
69f3c25
Merge branch 'main' of github.com:mozilla-ai/speech-to-text-finetune …
Kostis-S-Z Jan 22, 2025
df8d6ce
Add hf reference
Kostis-S-Z Jan 22, 2025
cb6a41d
Add config file using pydantic and yaml
Kostis-S-Z Jan 22, 2025
73d060b
Lint
Kostis-S-Z Jan 22, 2025
e388ac1
Fix yaml issues
Kostis-S-Z Jan 22, 2025
a3550e5
Convert TrainingConfig to dict
Kostis-S-Z Jan 22, 2025
22f790a
Fix EOF for yaml
Kostis-S-Z Jan 22, 2025
e157ff9
Fix attribute name
Kostis-S-Z Jan 22, 2025
8dcfbce
Rename app.py to transcribe_app.py
Kostis-S-Z Jan 29, 2025
e52364b
Add load_local_dataset function
Kostis-S-Z Jan 29, 2025
8b179be
Ignore local_data dir for git
Kostis-S-Z Jan 29, 2025
c562459
Add app to create local dataset
Kostis-S-Z Jan 29, 2025
8fbdc77
Merge from add-config
Kostis-S-Z Jan 29, 2025
73ff562
Add languages_common_voice_17_0.json
Kostis-S-Z Jan 29, 2025
f7ed2c1
Add dataset_source attribute to config
Kostis-S-Z Jan 29, 2025
d40011e
Minor config updates
Kostis-S-Z Jan 29, 2025
ab8110c
Use csv instead of tsv
Kostis-S-Z Jan 29, 2025
e5946b3
Fix attribute name
Kostis-S-Z Jan 29, 2025
b8ed63f
Enable using local dataset
Kostis-S-Z Jan 29, 2025
f0c4da5
Update import paths
Kostis-S-Z Jan 30, 2025
7d4d3a0
Merge branch 'add-config' of github.com:mozilla-ai/speech-to-text-fin…
Kostis-S-Z Jan 30, 2025
653d8c7
Fix relative paths
Kostis-S-Z Jan 30, 2025
a226010
Update log
Kostis-S-Z Jan 30, 2025
545e2c4
Merge from main
Kostis-S-Z Jan 30, 2025
db8c2fc
Remove hardcoded language path
Kostis-S-Z Jan 31, 2025
5dd17b2
Add note to README
Kostis-S-Z Jan 31, 2025
b78edfc
Remove unused arg from function
Kostis-S-Z Jan 31, 2025
4603045
Add docstrings to make_local_dataset_app.py
Kostis-S-Z Jan 31, 2025
6099c93
Fix type hints
Kostis-S-Z Jan 31, 2025
5f093aa
22 feature improve demo UI (#23)
stefanfrench Feb 4, 2025
39d3f78
Merge from remote
Kostis-S-Z Feb 6, 2025
c4f39f3
Add your own custom text data
Kostis-S-Z Feb 6, 2025
8a92b8f
Merge branch 'main' into 16-finetune-own-data
Kostis-S-Z Feb 6, 2025
975a928
Move dataset making app to src
Kostis-S-Z Feb 7, 2025
6b9c55d
Add HF instructions to README
Kostis-S-Z Feb 8, 2025
1ad2c80
Pin gradio version to fix UI issue
Kostis-S-Z Feb 8, 2025
b2c5c5e
Merge branch 'main' into 16-finetune-own-data
Kostis-S-Z Feb 11, 2025
fce3414
Fix paths
Kostis-S-Z Feb 11, 2025
ee30646
Merge remote-tracking branch 'origin/16-finetune-own-data' into 16-fi…
Kostis-S-Z Feb 11, 2025
a6e20b0
Add languages CV to demo dir
Kostis-S-Z Feb 11, 2025
41e81c1
Turn off fp16 by default
Kostis-S-Z Feb 12, 2025
75afab7
Format
Kostis-S-Z Feb 12, 2025
4 changes: 2 additions & 2 deletions .gitignore
@@ -1,6 +1,6 @@
# Ignore local artifacts generated by the finetuning job
-demo/artifacts/
-src/speech_to_text_finetune/artifacts/
+artifacts
+local_data

# Byte-compiled / optimized / DLL files
__pycache__/
10 changes: 7 additions & 3 deletions README.md
@@ -38,11 +38,15 @@ This blueprint consists of three independent, yet complementary, components:

### Suggested flow for this repository

**_Note_**: An HF account is required if you plan to use the Common Voice dataset for finetuning, or to use the Common Voice text data to create your own STT dataset.

1. Use a virtual environment and install the dependencies: `pip install -e .`, plus [ffmpeg](https://ffmpeg.org) (e.g. Ubuntu: `sudo apt install ffmpeg`, macOS: `brew install ffmpeg`)
2. Try existing transcription HF models on your own language & voice locally: `python demo/transcribe_app.py`
-3. If you are not happy with the results, you can finetune a model with data of your language from Common Voice
-   1. Configure `config.yaml` with the model, Common Voice dataset id from HF and hyperparameters of your choice.
-   2. Finetune a model: `python src/speech_to_text_finetune/finetune_whisper.py`
+3. If you are not happy with the results, you can finetune a model with data of your language from Common Voice.
+   1. Go to the HF [Common Voice dataset repo](https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0) and request access. It should be approved automatically if you are logged in.
+   2. On your local machine, run `huggingface-cli login` and follow the instructions to log in to your account (a quick programmatic login check is sketched after this diff).
+   3. Configure `config.yaml` with the model, Common Voice dataset id from HF and hyperparameters of your choice.
+   4. Finetune a model: `python src/speech_to_text_finetune/finetune_whisper.py`
4. Try the transcription app again with your newly finetuned model.
5. If the results are still not satisfactory, create your own Speech-to-Text dataset and model.
   1. Create a dataset: `python demo/make_local_dataset_app.py`
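Since the new README steps hinge on being authenticated with Hugging Face, a quick programmatic check can save a failed finetuning run. A minimal sketch, assuming `huggingface_hub` (already pulled in transitively via `transformers`/`datasets`):

```python
# Sketch: verify HF authentication before kicking off a long finetuning job.
# Assumes you have already run `huggingface-cli login` as the README describes.
from huggingface_hub import whoami

try:
    print(f"Logged in as: {whoami()['name']}")
except Exception:
    print("Not logged in - run `huggingface-cli login` first")
```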
15 changes: 11 additions & 4 deletions demo/app.py → demo/transcribe_app.py
@@ -22,9 +22,9 @@ def load_model(model_id: str, language: str) -> Tuple[Pipeline, str]:
model=model_id,
generate_kwargs={"language": language},
)
yield pipe, f"Model {model_id} has been loaded."
else:
-        yield None, "Please select a model and a language from the dropdown"
+        yield None, "⚠️ Please select a model and a language from the dropdown"


def transcribe(pipe: Pipeline, audio: gr.Audio) -> str:
@@ -34,19 +34,26 @@ def transcribe(pipe: Pipeline, audio: gr.Audio) -> str:

def setup_gradio_demo():
with gr.Blocks() as demo:
gr.Markdown(
""" # 🗣️ Speech-to-Text Transcription
### 1. Select a model and a language from the dropdowns.
### 2. Load the model by clicking the Load model button.
### 3. Record a message and click Transcribe to see the transcription.
"""
)
### Model & Language selection ###
dropdown_model = gr.Dropdown(
choices=model_ids, value=None, label="Select a model"
)
selected_lang = gr.Dropdown(
-            choices=languages, value=None, label="Select a language"
+            choices=list(languages), value=None, label="Select a language"
)
load_model_button = gr.Button("Load model")
model_loaded = gr.Markdown()

### Transcription ###
audio_input = gr.Audio(
sources="microphone", type="filepath", label="Record a message"
sources=["microphone"], type="filepath", label="Record a message"
)
transcribe_button = gr.Button("Transcribe")
transcribe_output = gr.Text(label="Output")
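The diff above shows the new UI components, but the event wiring is collapsed. For orientation, a sketch of how the buttons are presumably connected; the `model` state holding the pipeline is an assumption, not shown in this diff:

```python
# Hypothetical wiring, following the standard Gradio Blocks pattern:
model = gr.State(None)  # assumed holder for the loaded Pipeline

load_model_button.click(
    fn=load_model,
    inputs=[dropdown_model, selected_lang],
    outputs=[model, model_loaded],  # load_model yields (pipe, status_text)
)
transcribe_button.click(
    fn=transcribe,
    inputs=[model, audio_input],
    outputs=transcribe_output,
)
```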
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
"datasets[audio]",
"evaluate",
"jiwer",
"gradio",
"gradio<=5.12.0",
"loguru",
"tensorboard",
"transformers",
6 changes: 4 additions & 2 deletions src/speech_to_text_finetune/config.py
@@ -41,13 +41,15 @@ class Config(BaseModel):
Args:
model_id (str): HF model id of a Whisper model used for finetuning
dataset_id (str): HF dataset id of a Common Voice dataset version, ideally from the mozilla-foundation repo
dataset_source (str): can be "HF" or "local", to determine from where to fetch the dataset
language (str): registered language string that is supported by the Common Voice dataset
-        repo_name (str | None): used both for local dir and HF, None will create a name based on the model and language id
+        repo_name (str): used both for local dir and HF, "default" will create a name based on the model and language id
training_hp (TrainingConfig): store selective hyperparameter values from Seq2SeqTrainingArguments
"""

model_id: str
dataset_id: str
dataset_source: str
language: str
-    repo_name: str | None
+    repo_name: str
training_hp: TrainingConfig
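`finetune_whisper.py` imports a `load_config` helper from this module; its implementation is not part of this diff. Given the pydantic model above and the yaml config file, a plausible sketch (the exact shape of `load_config` is an assumption):

```python
# Minimal sketch of load_config: parse a yaml file into a validated Config.
import yaml

def load_config(config_path: str) -> Config:
    with open(config_path) as f:
        return Config(**yaml.safe_load(f))
```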
13 changes: 7 additions & 6 deletions src/speech_to_text_finetune/config.yaml
@@ -1,24 +1,25 @@
model_id: openai/whisper-tiny
dataset_id: mozilla-foundation/common_voice_17_0
dataset_source: HF
language: Greek
-repo_name: None
+repo_name: default

training_hp:
-  push_to_hub: False
+  push_to_hub: True
hub_private_repo: True
-  max_steps: 1
+  max_steps: 20
per_device_train_batch_size: 64
gradient_accumulation_steps: 1
learning_rate: 1e-5
warmup_steps: 50
gradient_checkpointing: True
-  fp16: True
+  fp16: False # If you have a GPU set this to True for faster training
eval_strategy: steps
per_device_eval_batch_size: 8
predict_with_generate: True
generation_max_length: 225
-  save_steps: 250
-  logging_steps: 25
+  save_steps: 5
+  logging_steps: 5
load_best_model_at_end: True
metric_for_best_model: wer
greater_is_better: False
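On the `fp16` change: mixed-precision training requires a CUDA GPU, so `False` is the safe CPU default. An illustrative snippet for deriving the flag from hardware instead of hardcoding it:

```python
# Sketch: enable fp16 only when a CUDA GPU is actually available.
import torch

use_fp16 = torch.cuda.is_available()  # fp16 is unsupported on CPU
```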
61 changes: 46 additions & 15 deletions src/speech_to_text_finetune/data_process.py
@@ -1,5 +1,6 @@
import os

import pandas as pd
import torch
from dataclasses import dataclass
from typing import Dict, List, Union
@@ -10,7 +11,7 @@
WhisperProcessor,
)

-from datasets import load_dataset, DatasetDict, Audio
+from datasets import load_dataset, DatasetDict, Audio, Dataset


def load_common_voice(dataset_id: str, language_id: str) -> DatasetDict:
@@ -21,27 +22,15 @@ def load_common_voice(dataset_id: str, language_id: str) -> DatasetDict:
language_id: a registered language identifier from Common Voice (most often in ISO-639 format)

Returns:
-        DatasetDict: Hugging Face dictionary that consists of two distinct datasets
+        DatasetDict: HF Dataset dictionary that consists of two distinct Datasets
"""
common_voice = DatasetDict()

common_voice["train"] = load_dataset(
dataset_id, language_id, split="train+validation"
)
common_voice["test"] = load_dataset(dataset_id, language_id, split="test")

-    return common_voice


-def process_dataset(
-    dataset: DatasetDict,
-    feature_extractor: WhisperFeatureExtractor,
-    tokenizer: WhisperTokenizer,
-) -> DatasetDict:
-    """
-    Process dataset to the expected format by a Whisper model. More info here:
-    """
-    dataset = dataset.remove_columns(
+    common_voice = common_voice.remove_columns(
[
"accent",
"age",
@@ -55,6 +44,48 @@
]
)

return common_voice


def load_local_dataset(dataset_dir: str, train_split: float = 0.8) -> DatasetDict:
"""
Load sentences and the accompanying recorded audio files into a pandas DataFrame, split it into train/test, and finally
load the splits into two distinct train and test Datasets.

Sentences and audio files should be index-aligned: the sentence at row <index> of text.csv should be accompanied by rec_<index>.wav

Args:
dataset_dir (str): path to the local dataset, expecting a text.csv and .wav files under the directory
train_split (float): fraction of the dataset assigned to the train split; the remainder forms the test split

Returns:
DatasetDict: HF Dataset dictionary in exactly the same format as the one returned by load_common_voice
"""
text_file = dataset_dir + "/text.csv"

dataframe = pd.read_csv(text_file)
audio_files = sorted(
[f"{dataset_dir}/{f}" for f in os.listdir(dataset_dir) if f.endswith(".wav")]
)

dataframe["audio"] = audio_files
train_index = round(len(dataframe) * train_split)

my_data = DatasetDict()
my_data["train"] = Dataset.from_pandas(dataframe[:train_index])
my_data["test"] = Dataset.from_pandas(dataframe[train_index:])

return my_data


def process_dataset(
dataset: DatasetDict,
feature_extractor: WhisperFeatureExtractor,
tokenizer: WhisperTokenizer,
) -> DatasetDict:
"""
Process the dataset into the format expected by a Whisper model.
"""
# Create a new column that consists of the resampled audio samples in the right sample rate for whisper
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

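For reference, a usage sketch of the new `load_local_dataset`; the directory layout below is inferred from the docstring, with files produced by `demo/make_local_dataset_app.py`:

```python
# Assumed layout:
#   local_data/
#   ├── text.csv     # indexed sentences, one per row
#   ├── rec_0.wav    # recording for sentence 0
#   └── rec_1.wav    # recording for sentence 1, and so on
dataset = load_local_dataset("local_data", train_split=0.8)
print(dataset)  # DatasetDict with "train" and "test" splits, like load_common_voice
```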
46 changes: 31 additions & 15 deletions src/speech_to_text_finetune/finetune_whisper.py
@@ -1,3 +1,4 @@
import json
from functools import partial

from transformers import (
@@ -18,22 +19,27 @@
from speech_to_text_finetune.config import load_config
from speech_to_text_finetune.data_process import (
load_common_voice,
load_local_dataset,
DataCollatorSpeechSeq2SeqWithPadding,
process_dataset,
)
from speech_to_text_finetune.hf_utils import (
get_hf_username,
upload_custom_hf_model_card,
get_available_languages_in_cv,
)


-def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:
+def run_finetuning(
+    config_path: str = "config.yaml",
+    languages_path: str = "languages_common_voice_17_0.json",
+) -> Tuple[Dict, Dict]:
"""
Complete pipeline for preprocessing the Common Voice dataset and then finetuning a Whisper model on it.

Args:
-        config_path (str): The filepath to a yaml file that follows the format defined in config.py
+        config_path (str): yaml filepath that follows the format defined in config.py
+        languages_path (str): json filepath that stores all languages available for finetuning,
+            see hf_utils/get_available_languages_in_cv() for more details.

Returns:
Tuple[Dict, Dict]: evaluation metrics from the baseline and the finetuned models
@@ -42,22 +48,29 @@ def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:

hf_username = get_hf_username()

-    languages_name_to_id = get_available_languages_in_cv(cfg.dataset_id)
+    with open(languages_path) as json_file:
+        languages_name_to_id = json.load(json_file)
language_id = languages_name_to_id[cfg.language]

-    if not cfg.repo_name:
-        cfg.repo_name = f"{cfg.model.model_id.split('/')[1]}-{language_id}"
+    if cfg.repo_name == "default":
+        cfg.repo_name = f"{cfg.model_id.split('/')[1]}-{language_id}"
hf_repo_name = f"{hf_username}/{cfg.repo_name}"
local_output_dir = f"./artifacts/{cfg.repo_name}"

-    logger.info(
-        f"Finetuning job will soon start. "
-        f"Results will be saved local at {local_output_dir} uploaded in HF at {hf_repo_name}. "
-        f"Private repo is set to {cfg.training_hp.hub_private_repo}."
-    )
+    logger.info(f"Finetuning starts soon, results saved locally at {local_output_dir}")
+    if cfg.training_hp.push_to_hub:
+        logger.info(
+            f"Results will also be uploaded in HF at {hf_repo_name}. "
+            f"Private repo is set to {cfg.training_hp.hub_private_repo}."
+        )

logger.info(f"Loading the {cfg.language} subset from the {cfg.dataset_id} dataset.")
-    dataset = load_common_voice(cfg.dataset_id, language_id)
+    if cfg.dataset_source == "HF":
+        dataset = load_common_voice(cfg.dataset_id, language_id)
+    elif cfg.dataset_source == "local":
+        dataset = load_local_dataset(cfg.dataset_id, train_split=0.8)
+    else:
+        raise ValueError(f"Unknown dataset source {cfg.dataset_source}")

device = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"

@@ -89,7 +102,7 @@ def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:
output_dir=local_output_dir,
hub_model_id=hf_repo_name,
report_to=["tensorboard"],
-        **cfg.training_hp.dict(),
+        **cfg.training_hp.model_dump(),
)

metric = evaluate.load("wer")
@@ -126,7 +139,7 @@ def run_finetuning(config_path: str = "config.yaml") -> Tuple[Dict, Dict]:
eval_results = trainer.evaluate()
logger.info(f"Evaluation complete. Results:\n\t {eval_results}")

-    if cfg.training_hp.push_to_hf:
+    if cfg.training_hp.push_to_hub:
logger.info(f"Uploading model and eval results to HuggingFace: {hf_repo_name}")
trainer.push_to_hub()
upload_custom_hf_model_card(
@@ -183,4 +196,7 @@ def compute_word_error_rate(


if __name__ == "__main__":
-    run_finetuning(config_path="src/speech_to_text_finetune/config.yaml")
+    run_finetuning(
+        config_path="src/speech_to_text_finetune/config.yaml",
+        languages_path="demo/languages_common_voice_17_0.json",
+    )
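Putting the pieces together: as the new branch in `run_finetuning` shows, `dataset_id` doubles as the local directory path when `dataset_source` is `local`, so finetuning on a locally recorded dataset only needs two config changes. A sketch:

```python
# Sketch: run the pipeline on a local dataset. config.yaml would contain e.g.
#   dataset_source: local
#   dataset_id: local_data   # dir with text.csv + rec_<index>.wav files
from speech_to_text_finetune.finetune_whisper import run_finetuning

baseline_results, finetuned_results = run_finetuning(
    config_path="src/speech_to_text_finetune/config.yaml",
    languages_path="demo/languages_common_voice_17_0.json",
)
print(baseline_results, finetuned_results)
```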