Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# CI gate: run every hook from .pre-commit-config.yaml on each push and pull request.
name: pre-commit
on: [pull_request, push]
jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so the hooks can see the files.
      - uses: actions/checkout@v4
      # Python version matches the language_version pinned in .pre-commit-config.yaml.
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install pre-commit
        run: python -m pip install --upgrade pre-commit
      - name: Run pre-commit
        run: pre-commit run --all-files
22 changes: 21 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -427,4 +427,24 @@ transliteration*
.venv*
infra/target*
.vscode*
junk/
junk/
*.code-workspace

# Machine learning artifacts and cache directories
mlruns/
custom_data/
output_model_dir/
nemo_rnnt_da/
training_console.log
cache*
.mypy_cache/
.ruff_cache/

# Local dataset directories
apps/whisper_fine_tuning/data/

*.DS_Store

data/

predictions_dir/
44 changes: 44 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
repos:
  # Code formatter.
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        language_version: python3.11  # require Python 3.11 or newer

  # Linter with auto-fix. NOTE: the hook repo moved from
  # charliermarsh/ruff-pre-commit to astral-sh/ruff-pre-commit;
  # the old URL only works via redirect, so use the current name.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.0
    hooks:
      - id: ruff
        args: [--fix, --extend-ignore, E402]  # ruff will auto-fix many issues

  # Import sorter, configured to agree with black's formatting.
  # NOTE(review): mirrors-isort is archived upstream — consider the
  # official PyCQA/isort hook (or ruff's `I` rules) when next bumping revs.
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black"]

  # Generic hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-added-large-files

  # Static type checking.
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1  # pick a valid tag you confirmed with git ls-remote
    hooks:
      - id: mypy
        # keep commonly useful flags, then selectively disable error codes reported by mypy
        args:
          - --ignore-missing-imports
          - --disable-error-code=import-untyped
          - --disable-error-code=call-arg
          - --disable-error-code=union-attr
          - --disable-error-code=arg-type
          - --disable-error-code=used-before-def
          - --disable-error-code=attr-defined
        files: \.py$
        language_version: python3.11
49 changes: 49 additions & 0 deletions apps/whisper_fine_tuning/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Export HOME into recipe environments (pyenv installs under $HOME/.pyenv below).
export HOME := $(HOME)
# Run each recipe in ONE shell (GNU Make >= 3.82) so activation commands
# (conda activate / venv activate) persist across subsequent recipe lines.
.ONESHELL:

# Select shell and conda-activation prelude per platform.
ifeq ($(OS),Windows_NT)
SHELL = cmd
CONDA_ACTIVATE = call %CONDA_PREFIX%\Scripts\activate.bat
else
SHELL = /bin/bash
# Sources conda's shell hook, activates base, then deliberately ends with a
# dangling `conda activate` so callers write `$(CONDA_ACTIVATE) <env-name>`.
CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
endif

# Bootstrap an Azure ML (Linux) workstation: install pyenv + Python 3.12,
# create the `condav0` conda env, and install project deps with poetry.
# Command target, not a file — declare phony so a stray `setup_aml` file
# can never mask it.
.PHONY: setup_aml
setup_aml:
	rm -rf ~/.pyenv
	curl https://pyenv.run | bash
	$(HOME)/.pyenv/bin/pyenv --version
	$(HOME)/.pyenv/bin/pyenv install 3.12 --skip-existing
	$(HOME)/.pyenv/bin/pyenv local 3.12
	python --version
	# -y: recipes run non-interactively; without it conda prompts and hangs.
	conda create -y -n condav0 python=3.12
	# CONDA_ACTIVATE already ends in `conda activate`, so pass ONLY the env
	# name. (The previous `$(CONDA_ACTIVATE) activate condav0` expanded to
	# the invalid command `conda activate activate condav0`.)
	$(CONDA_ACTIVATE) condav0
	conda install -y -c conda-forge poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock --no-update
	poetry install
	conda install -y pip
	conda install -y -c conda-forge "ffmpeg>=5,<7"
	sudo apt update && sudo apt install -y ffmpeg
	python -m ipykernel install --user --name condav0 --display-name "condav0"


# Import USERPROFILE from the environment for use in recipe paths.
USERPROFILE := $(USERPROFILE)
# Use make's built-in absolute cwd. The previous `$(shell cd)` only prints a
# directory under cmd.exe; under a POSIX shell `cd` prints nothing, leaving
# CURRENT_DIR empty.
CURRENT_DIR := $(CURDIR)

# Bootstrap a Windows workstation: pyenv-win + Python 3.12, then a local venv
# with poetry-managed dependencies. Recipe lines are cmd.exe syntax.
# NOTE(review): the trailing `conda install` assumes conda is already on PATH
# even though this flow is venv-based — confirm it is still needed.
.PHONY: setup_win
setup_win:
	if exist %USERPROFILE%\.pyenv rmdir /s /q %USERPROFILE%\.pyenv
	git clone https://github.com/pyenv-win/pyenv-win.git "%USERPROFILE%\.pyenv"
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv --version
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv install 3.12 --skip-existing
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv local 3.12
	python --version
	python -m venv venv
	echo $(CURRENT_DIR)
	call "$(CURRENT_DIR)/venv/Scripts/activate"
	pip install poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock
	poetry install
	conda install -y pip
111 changes: 111 additions & 0 deletions apps/whisper_fine_tuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Whisper Fine-Tuning Pipeline

This project provides an end-to-end workflow for preparing custom audio datasets, fine-tuning OpenAI’s Whisper models with LoRA/PEFT, and operationalizing the resulting checkpoints on Azure ML.

## Highlights
- LoRA-based fine-tuning scripts that minimize GPU memory requirements.
- Data ingestion utilities that convert raw audio into Hugging Face `datasets` format.
- MLflow integration for experiment tracking and artifact storage.
- Azure ML job definitions for cloud-scale training and evaluation.

## Project Layout
```
apps/whisper_fine_tuning/
├── deployment/ # Azure ML job specs and training entrypoints
├── docs/ # Diagrams and supporting documentation
├── infra/ # Infrastructure-as-code templates
├── notebooks/ # Exploratory analysis and inference notebooks
├── src/core/ # Data prep, training, and evaluation modules
└── data/ # Example datasets (raw/silver) – ignored in git
```

## Quick Start
1. **Install dependencies**
```bash
poetry env use 3.12
poetry install
```
Activate the environment with `poetry shell` or your preferred virtualenv tool.

2. **Prepare raw audio**
```bash
python src/core/data_prep/main_data_prep.py \
--source_data_dir data/raw/audios/matis \
--output_data_dir data/raw/training \
--domain train
```
Repeat for `--domain evaluation` and `--domain test`. See `src/core/data_prep/README.md` for more options.

3. **Generate silver dataset**
```bash
python src/core/data_prep/main_silver_data_prep.py \
--train_datasets data/raw/training \
--eval_datasets data/raw/evaluation \
--test_datasets data/raw/testing
```

4. **Train Whisper with LoRA**
```bash
python src/core/train/main_train.py \
--model_name openai/whisper-large-v2 \
--dataset data/silver/dataset \
--language Matis \
--output_dir output_model_dir \
--apply_lora True
```

## Running on Azure ML
Submit the packaged job using the provided YAML spec:
```bash
az ml job create --file deployment/training_job.yaml
```
Customize compute, environment, and inputs inside the YAML before submission.

## Evaluation & Inference
- Offline evaluation: `python src/core/evaluation/evaluation_process.py --eval_datasets data/raw/testing ...`
- NeMo experiments: `python src/core/train/main_train_nemo.py --dataset_path data/silver/dataset ...`
- Inference notebook: `notebooks/fine_tuned_usage.ipynb`

## Key CLI Arguments
| Script | Argument | Description |
| --- | --- | --- |
| `main_data_prep.py` | `--source_data_dir` | Folder with `audio_paths` and `text` mappings |
| | `--domain` | `train`, `test`, `evaluation`, or `all` |
| `main_silver_data_prep.py` | `--train_datasets`/`--eval_datasets` | Hugging Face datasets produced by the raw prep stage |
| `main_train.py` | `--model_name` | Base Whisper checkpoint (default `openai/whisper-small`) |
| | `--apply_lora` | Enable/disable LoRA adapters |
| | `--experiment_name` | MLflow experiment name; auto-generated if omitted |

Full option lists live in each script’s `--help` output.

## MLflow & Logging
- Runs log configuration, metrics, and console output (see `training_console.log`).
- Checkpoints are written to `output_model_dir/` and can be registered with MLflow or uploaded to Azure.

## Pre-Commit Hooks
```bash
pip install pre-commit
pre-commit install
pre-commit run --all-files
```
Hooks enforce formatting and linting before changes land in version control.

## Troubleshooting
- Verify `environment.yml` or `pyproject.toml` dependencies are installed.
- Ensure datasets follow the expected directory structure (`data/raw/audios/...`).
- For Azure ML issues, confirm workspace credentials and compute targets.

## Fine-Tuning Tips
| Parameter | Recommended | Why |
| --- | --- | --- |
| `per_device_train_batch_size` | 4–8 | Small datasets benefit from smaller batches |
| `num_train_epochs` | 10–30 | Compensate for limited data with more passes |
| `warmup_steps` | 50–100 | Faster ramp-up than large fixed counts |
| `eval_strategy` | `epoch` | Evaluate once per epoch to avoid noise |
| `save_strategy` | `epoch` | Align checkpointing with evaluation |
| `load_best_model_at_end` | `True` | Automatically keep the best-performing checkpoint |

Additional guidance on data augmentation, freezing layers, early stopping, and MLflow logging is summarized in `src/core/train/README.md`.

## License
Licensed under the MIT License. Contributions and issues are welcome.
70 changes: 70 additions & 0 deletions apps/whisper_fine_tuning/deployment/endpoint/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Invoke an Azure ML online scoring endpoint with a base64-encoded audio file
and print the returned transcription(s).

Configuration (environment variables, loadable from a .env file):
    SCORE_ENDPOINT  Endpoint URL (defaults to the sandbox endpoint below).
    API_KEY         Required: primary/secondary key, AMLToken, or Microsoft
                    Entra ID token for the endpoint.
    AUDIO_PATH      Optional path to the audio file to transcribe (defaults to
                    a local sample path).

The payload below assumes JSON formatting which may be updated depending on
the format your endpoint expects. More information can be found here:
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
"""
import base64
import json
import os
import pathlib
import urllib.error
import urllib.request

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from a .env file

# AUDIO_PATH generalizes the previously hard-coded local path.
audio_path = pathlib.Path(
    os.getenv(
        "AUDIO_PATH",
        "/Users/karinaassiniandreatta/Documents/06 microsoft_codes/azuresamples/whisper-fine-tuning/src/core/create_data/matis/train_data/audio2.ogg",
    )
)
audio_b64 = base64.b64encode(audio_path.read_bytes()).decode("utf-8")

# Azure ML "dataframe split" style payload: one row, one column with the audio.
payload = {
    "input_data": {
        "columns": ["audio_base64"],
        "index": [0],
        "data": [[audio_b64]],
    },
    "params": {},
}
body = str.encode(json.dumps(payload))

url = os.getenv(
    "SCORE_ENDPOINT",
    "https://ml-sandbox-core-hdydq.eastus2.inference.ml.azure.com/score",
)
# Replace this with the primary/secondary key, AMLToken, or Microsoft Entra ID token for the endpoint
api_key = os.getenv("API_KEY")
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")


headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": ("Bearer " + api_key),
}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)
    raw_text = response.read().decode("utf-8")
    payload = json.loads(raw_text)

    # Handle the case where the body itself is a JSON-encoded string
    # (double-encoded response).
    if isinstance(payload, str):
        payload = json.loads(payload)

    # A list of row results: print each transcription if present,
    # otherwise print the raw item.
    if isinstance(payload, list) and payload:
        for item in payload:
            if isinstance(item, dict) and "transcription" in item:
                print(item["transcription"])
            else:
                print(item)
    else:
        print(payload)

except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp,
    # which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", "ignore"))
46 changes: 46 additions & 0 deletions apps/whisper_fine_tuning/deployment/endpoint/terraform/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -euo pipefail

# Check if terraform is installed; if not, install it
if ! command -v terraform &>/dev/null; then
    echo "Terraform not found. Installing Terraform..."
    TERRAFORM_VERSION="1.5.0" # Modify version as needed
    TERRAFORM_ZIP="terraform_${TERRAFORM_VERSION}_linux_amd64.zip"
    TEMP_DIR=$(mktemp -d)
    wget "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" -O "${TEMP_DIR}/${TERRAFORM_ZIP}"
    unzip "${TEMP_DIR}/${TERRAFORM_ZIP}" -d "${TEMP_DIR}"
    sudo mv "${TEMP_DIR}/terraform" /usr/local/bin/
    rm -rf "${TEMP_DIR}"
    echo "Terraform installed successfully."
fi

# Deploy wrapper: loads .env and exports TF_VAR_<lowercase> variables for Terraform
# Usage: place a .env in this folder (or edit .env.sample), then run ./deploy.sh

ENV_FILE=".env"
if [[ -f "$ENV_FILE" ]]; then
  echo "Loading $ENV_FILE and exporting TF_VAR_* variables for Terraform"
  # `|| [[ -n "$raw_key" ]]` keeps the final line even without a trailing newline.
  while IFS='=' read -r raw_key raw_value || [[ -n "$raw_key" ]]; do
    key="$raw_key"
    value="$raw_value"
    # skip comments and empty lines
    [[ -z "${key// /}" ]] && continue
    # bash's =~ uses POSIX ERE, which has no \s escape; [[:space:]] makes the
    # comment skip actually work on all platforms (the old ^\s*# silently
    # failed to match on macOS/BSD).
    [[ "$key" =~ ^[[:space:]]*# ]] && continue
    # trim spaces
    key=$(echo "$key" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    value=$(echo "$value" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    # remove surrounding quotes if any
    value=$(echo "$value" | sed 's/^"\(.*\)"$/\1/; s/^'\''\(.*\)'\''$/\1/')
    # lowercase the key for TF_VAR convention
    lc_key=$(echo "$key" | tr '[:upper:]' '[:lower:]')
    export_var_name="TF_VAR_${lc_key}"
    # Quote the whole NAME=VALUE pair so values containing spaces or globs
    # export intact.
    export "${export_var_name}=${value}"
    # Do not echo the value itself: .env entries are frequently secrets.
    echo "Exported ${export_var_name}"
  done < "$ENV_FILE"
else
  echo "No $ENV_FILE found. Create one or pass -var flags to terraform."
fi

echo "Running terraform init && apply"
terraform init
terraform apply -auto-approve "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"input_data": {
"columns": [
"input"
],
"index": [
0
],
"data": [
[
"https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3"
]
]
},
"params": {}
}
Loading
Loading