Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .github/workflows/pre-commit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# CI gate: run every hook from .pre-commit-config.yaml on each push and pull request.
name: pre-commit
on: [pull_request, push]
jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so the hooks can see the files.
      - uses: actions/checkout@v4
      # Python version matches the language_version pinned in .pre-commit-config.yaml.
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install pre-commit
        run: python -m pip install --upgrade pre-commit
      - name: Run pre-commit
        run: pre-commit run --all-files
22 changes: 21 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -427,4 +427,24 @@ transliteration*
.venv*
infra/target*
.vscode*
junk/
junk/
*.code-workspace

# Machine learning artifacts and cache directories
mlruns/
custom_data/
output_model_dir/
nemo_rnnt_da/
training_console.log
cache*
.mypy_cache/
.ruff_cache/

# Local dataset directories
apps/whisper_fine_tuning/data/

*.DS_Store

data/

predictions_dir/
44 changes: 44 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
repos:
  # Code formatter.
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        language_version: python3.11  # require Python 3.11 or newer

  # Linter with auto-fix. NOTE: the hook repo moved from
  # charliermarsh/ruff-pre-commit to astral-sh/ruff-pre-commit;
  # the old URL only works via redirect, so use the current name.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.0
    hooks:
      - id: ruff
        args: [--fix, --extend-ignore, E402]  # ruff will auto-fix many issues

  # Import sorter, configured to agree with black's formatting.
  # NOTE(review): mirrors-isort is archived upstream — consider the
  # official PyCQA/isort hook (or ruff's `I` rules) when next bumping revs.
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
        args: ["--profile", "black"]

  # Generic hygiene checks.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-added-large-files

  # Static type checking.
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.6.1  # pick a valid tag you confirmed with git ls-remote
    hooks:
      - id: mypy
        # keep commonly useful flags, then selectively disable error codes reported by mypy
        args:
          - --ignore-missing-imports
          - --disable-error-code=import-untyped
          - --disable-error-code=call-arg
          - --disable-error-code=union-attr
          - --disable-error-code=arg-type
          - --disable-error-code=used-before-def
          - --disable-error-code=attr-defined
        files: \.py$
        language_version: python3.11
49 changes: 49 additions & 0 deletions apps/whisper_fine_tuning/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Export HOME into recipe environments (pyenv installs under $HOME/.pyenv below).
export HOME := $(HOME)
# Run each recipe in ONE shell (GNU Make >= 3.82) so activation commands
# (conda activate / venv activate) persist across subsequent recipe lines.
.ONESHELL:

# Select shell and conda-activation prelude per platform.
ifeq ($(OS),Windows_NT)
SHELL = cmd
CONDA_ACTIVATE = call %CONDA_PREFIX%\Scripts\activate.bat
else
SHELL = /bin/bash
# Sources conda's shell hook, activates base, then deliberately ends with a
# dangling `conda activate` so callers write `$(CONDA_ACTIVATE) <env-name>`.
CONDA_ACTIVATE = source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
endif

# Bootstrap an Azure ML (Linux) workstation: install pyenv + Python 3.12,
# create the `condav0` conda env, and install project deps with poetry.
# Command target, not a file — declare phony so a stray `setup_aml` file
# can never mask it.
.PHONY: setup_aml
setup_aml:
	rm -rf ~/.pyenv
	curl https://pyenv.run | bash
	$(HOME)/.pyenv/bin/pyenv --version
	$(HOME)/.pyenv/bin/pyenv install 3.12 --skip-existing
	$(HOME)/.pyenv/bin/pyenv local 3.12
	python --version
	# -y: recipes run non-interactively; without it conda prompts and hangs.
	conda create -y -n condav0 python=3.12
	# CONDA_ACTIVATE already ends in `conda activate`, so pass ONLY the env
	# name. (The previous `$(CONDA_ACTIVATE) activate condav0` expanded to
	# the invalid command `conda activate activate condav0`.)
	$(CONDA_ACTIVATE) condav0
	conda install -y -c conda-forge poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock --no-update
	poetry install
	conda install -y pip
	conda install -y -c conda-forge "ffmpeg>=5,<7"
	sudo apt update && sudo apt install -y ffmpeg
	python -m ipykernel install --user --name condav0 --display-name "condav0"


# Import USERPROFILE from the environment for use in recipe paths.
USERPROFILE := $(USERPROFILE)
# Use make's built-in absolute cwd. The previous `$(shell cd)` only prints a
# directory under cmd.exe; under a POSIX shell `cd` prints nothing, leaving
# CURRENT_DIR empty.
CURRENT_DIR := $(CURDIR)

# Bootstrap a Windows workstation: pyenv-win + Python 3.12, then a local venv
# with poetry-managed dependencies. Recipe lines are cmd.exe syntax.
# NOTE(review): the trailing `conda install` assumes conda is already on PATH
# even though this flow is venv-based — confirm it is still needed.
.PHONY: setup_win
setup_win:
	if exist %USERPROFILE%\.pyenv rmdir /s /q %USERPROFILE%\.pyenv
	git clone https://github.com/pyenv-win/pyenv-win.git "%USERPROFILE%\.pyenv"
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv --version
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv install 3.12 --skip-existing
	$(USERPROFILE)\.pyenv\pyenv-win\bin\pyenv local 3.12
	python --version
	python -m venv venv
	echo $(CURRENT_DIR)
	call "$(CURRENT_DIR)/venv/Scripts/activate"
	pip install poetry
	poetry config virtualenvs.create true
	poetry config virtualenvs.in-project true
	poetry lock
	poetry install
	conda install -y pip
111 changes: 111 additions & 0 deletions apps/whisper_fine_tuning/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Whisper Fine-Tuning Pipeline

This project provides an end-to-end workflow for preparing custom audio datasets, fine-tuning OpenAI’s Whisper models with LoRA/PEFT, and operationalizing the resulting checkpoints on Azure ML.

## Highlights
- LoRA-based fine-tuning scripts that minimize GPU memory requirements.
- Data ingestion utilities that convert raw audio into Hugging Face `datasets` format.
- MLflow integration for experiment tracking and artifact storage.
- Azure ML job definitions for cloud-scale training and evaluation.

## Project Layout
```
apps/whisper_fine_tuning/
├── deployment/ # Azure ML job specs and training entrypoints
├── docs/ # Diagrams and supporting documentation
├── infra/ # Infrastructure-as-code templates
├── notebooks/ # Exploratory analysis and inference notebooks
├── src/core/ # Data prep, training, and evaluation modules
└── data/ # Example datasets (raw/silver) – ignored in git
```

## Quick Start
1. **Install dependencies**
```bash
poetry env use 3.12
poetry install
```
Activate the environment with `poetry shell` or your preferred virtualenv tool.

2. **Prepare raw audio**
```bash
python src/core/data_prep/main_data_prep.py \
--source_data_dir data/raw/audios/matis \
--output_data_dir data/raw/training \
--domain train
```
Repeat for `--domain evaluation` and `--domain test`. See `src/core/data_prep/README.md` for more options.

3. **Generate silver dataset**
```bash
python src/core/data_prep/main_silver_data_prep.py \
--train_datasets data/raw/training \
--eval_datasets data/raw/evaluation \
--test_datasets data/raw/testing
```

4. **Train Whisper with LoRA**
```bash
python src/core/train/main_train.py \
--model_name openai/whisper-large-v2 \
--dataset data/silver/dataset \
--language Matis \
--output_dir output_model_dir \
--apply_lora True
```

## Running on Azure ML
Submit the packaged job using the provided YAML spec:
```bash
az ml job create --file deployment/training_job.yaml
```
Customize compute, environment, and inputs inside the YAML before submission.

## Evaluation & Inference
- Offline evaluation: `python src/core/evaluation/evaluation_process.py --eval_datasets data/raw/testing ...`
- NeMo experiments: `python src/core/train/main_train_nemo.py --dataset_path data/silver/dataset ...`
- Inference notebook: `notebooks/fine_tuned_usage.ipynb`

## Key CLI Arguments
| Script | Argument | Description |
| --- | --- | --- |
| `main_data_prep.py` | `--source_data_dir` | Folder with `audio_paths` and `text` mappings |
| | `--domain` | `train`, `test`, `evaluation`, or `all` |
| `main_silver_data_prep.py` | `--train_datasets`/`--eval_datasets` | Hugging Face datasets produced by the raw prep stage |
| `main_train.py` | `--model_name` | Base Whisper checkpoint (default `openai/whisper-small`) |
| | `--apply_lora` | Enable/disable LoRA adapters |
| | `--experiment_name` | MLflow experiment name; auto-generated if omitted |

Full option lists live in each script’s `--help` output.

## MLflow & Logging
- Runs log configuration, metrics, and console output (see `training_console.log`).
- Checkpoints are written to `output_model_dir/` and can be registered with MLflow or uploaded to Azure.

## Pre-Commit Hooks
```bash
pip install pre-commit
pre-commit install
pre-commit run --all-files
```
Hooks enforce formatting and linting before changes land in version control.

## Troubleshooting
- Verify `environment.yml` or `pyproject.toml` dependencies are installed.
- Ensure datasets follow the expected directory structure (`data/raw/audios/...`).
- For Azure ML issues, confirm workspace credentials and compute targets.

## Fine-Tuning Tips
| Parameter | Recommended | Why |
| --- | --- | --- |
| `per_device_train_batch_size` | 4–8 | Small datasets benefit from smaller batches |
| `num_train_epochs` | 10–30 | Compensate for limited data with more passes |
| `warmup_steps` | 50–100 | Faster ramp-up than large fixed counts |
| `eval_strategy` | `epoch` | Evaluate once per epoch to avoid noise |
| `save_strategy` | `epoch` | Align checkpointing with evaluation |
| `load_best_model_at_end` | `True` | Automatically keep the best-performing checkpoint |

Additional guidance on data augmentation, freezing layers, early stopping, and MLflow logging is summarized in `src/core/train/README.md`.

## License
Licensed under the MIT License. Contributions and issues are welcome.
70 changes: 70 additions & 0 deletions apps/whisper_fine_tuning/deployment/endpoint/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Invoke an Azure ML online scoring endpoint with a base64-encoded audio file
and print the returned transcription(s).

Configuration (environment variables, loadable from a .env file):
    SCORE_ENDPOINT  Endpoint URL (defaults to the sandbox endpoint below).
    API_KEY         Required: primary/secondary key, AMLToken, or Microsoft
                    Entra ID token for the endpoint.
    AUDIO_PATH      Optional path to the audio file to transcribe (defaults to
                    a local sample path).

The payload below assumes JSON formatting which may be updated depending on
the format your endpoint expects. More information can be found here:
https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
"""
import base64
import json
import os
import pathlib
import urllib.error
import urllib.request

from dotenv import load_dotenv

load_dotenv()  # Load environment variables from a .env file

# AUDIO_PATH generalizes the previously hard-coded local path.
audio_path = pathlib.Path(
    os.getenv(
        "AUDIO_PATH",
        "/Users/karinaassiniandreatta/Documents/06 microsoft_codes/azuresamples/whisper-fine-tuning/src/core/create_data/matis/train_data/audio2.ogg",
    )
)
audio_b64 = base64.b64encode(audio_path.read_bytes()).decode("utf-8")

# Azure ML "dataframe split" style payload: one row, one column with the audio.
payload = {
    "input_data": {
        "columns": ["audio_base64"],
        "index": [0],
        "data": [[audio_b64]],
    },
    "params": {},
}
body = str.encode(json.dumps(payload))

url = os.getenv(
    "SCORE_ENDPOINT",
    "https://ml-sandbox-core-hdydq.eastus2.inference.ml.azure.com/score",
)
# Replace this with the primary/secondary key, AMLToken, or Microsoft Entra ID token for the endpoint
api_key = os.getenv("API_KEY")
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")


headers = {
    "Content-Type": "application/json",
    "Accept": "application/json",
    "Authorization": ("Bearer " + api_key),
}

req = urllib.request.Request(url, body, headers)

try:
    response = urllib.request.urlopen(req)
    raw_text = response.read().decode("utf-8")
    payload = json.loads(raw_text)

    # Handle the case where the body itself is a JSON-encoded string
    # (double-encoded response).
    if isinstance(payload, str):
        payload = json.loads(payload)

    # A list of row results: print each transcription if present,
    # otherwise print the raw item.
    if isinstance(payload, list) and payload:
        for item in payload:
            if isinstance(item, dict) and "transcription" in item:
                print(item["transcription"])
            else:
                print(item)
    else:
        print(payload)

except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp,
    # which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", "ignore"))
46 changes: 46 additions & 0 deletions apps/whisper_fine_tuning/deployment/endpoint/terraform/deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -euo pipefail

# Check if terraform is installed; if not, install it
if ! command -v terraform &>/dev/null; then
    echo "Terraform not found. Installing Terraform..."
    TERRAFORM_VERSION="1.5.0" # Modify version as needed
    TERRAFORM_ZIP="terraform_${TERRAFORM_VERSION}_linux_amd64.zip"
    TEMP_DIR=$(mktemp -d)
    wget "https://releases.hashicorp.com/terraform/${TERRAFORM_VERSION}/${TERRAFORM_ZIP}" -O "${TEMP_DIR}/${TERRAFORM_ZIP}"
    unzip "${TEMP_DIR}/${TERRAFORM_ZIP}" -d "${TEMP_DIR}"
    sudo mv "${TEMP_DIR}/terraform" /usr/local/bin/
    rm -rf "${TEMP_DIR}"
    echo "Terraform installed successfully."
fi

# Deploy wrapper: loads .env and exports TF_VAR_<lowercase> variables for Terraform
# Usage: place a .env in this folder (or edit .env.sample), then run ./deploy.sh

ENV_FILE=".env"
if [[ -f "$ENV_FILE" ]]; then
  echo "Loading $ENV_FILE and exporting TF_VAR_* variables for Terraform"
  # `|| [[ -n "$raw_key" ]]` keeps the final line even without a trailing newline.
  while IFS='=' read -r raw_key raw_value || [[ -n "$raw_key" ]]; do
    key="$raw_key"
    value="$raw_value"
    # skip comments and empty lines
    [[ -z "${key// /}" ]] && continue
    # bash's =~ uses POSIX ERE, which has no \s escape; [[:space:]] makes the
    # comment skip actually work on all platforms (the old ^\s*# silently
    # failed to match on macOS/BSD).
    [[ "$key" =~ ^[[:space:]]*# ]] && continue
    # trim spaces
    key=$(echo "$key" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    value=$(echo "$value" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')
    # remove surrounding quotes if any
    value=$(echo "$value" | sed 's/^"\(.*\)"$/\1/; s/^'\''\(.*\)'\''$/\1/')
    # lowercase the key for TF_VAR convention
    lc_key=$(echo "$key" | tr '[:upper:]' '[:lower:]')
    export_var_name="TF_VAR_${lc_key}"
    # Quote the whole NAME=VALUE pair so values containing spaces or globs
    # export intact.
    export "${export_var_name}=${value}"
    # Do not echo the value itself: .env entries are frequently secrets.
    echo "Exported ${export_var_name}"
  done < "$ENV_FILE"
else
  echo "No $ENV_FILE found. Create one or pass -var flags to terraform."
fi

echo "Running terraform init && apply"
terraform init
terraform apply -auto-approve "$@"
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"input_data": {
"columns": [
"input"
],
"index": [
0
],
"data": [
[
"https://www.soundhelix.com/examples/mp3/SoundHelix-Song-1.mp3"
]
]
},
"params": {}
}
Loading
Loading