Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,30 @@ Automatically configured based on your GPU:
| NVIDIA SM 6.x and older (Pascal, Maxwell) | ❌ Disabled (uses math backend) |
| AMD GPUs | ❌ Disabled (compatibility varies) |

### 🔥 torch.compile (Experimental)
Enable PyTorch 2.0+ compilation for **~2x faster inference** on supported GPUs:

```bash
# Enable torch.compile
HEARTMULA_COMPILE=true python -m uvicorn backend.app.main:app --host 0.0.0.0 --port 8000

# With max performance (slower first run, faster subsequent runs)
HEARTMULA_COMPILE=true HEARTMULA_COMPILE_MODE=max-autotune python -m uvicorn backend.app.main:app --host 0.0.0.0 --port 8000
```

| Mode | Description |
|------|-------------|
| `default` | Good balance of compile time and performance |
| `reduce-overhead` | Reduces Python/kernel-launch overhead via CUDA graphs — best for small batches (uses some extra memory) |
| `max-autotune` | Best performance, but slowest compilation (recommended for production) |

**Requirements:**
- PyTorch 2.0+
- **Linux/WSL2**: Install Triton (`pip install triton`)
- **Windows**: Install Triton-Windows (`pip install -U 'triton-windows>=3.2,<3.3'`)

> **Note:** First generation will be slower due to compilation. Subsequent generations benefit from the compiled kernels.

### 🎯 Smart Multi-GPU Detection
Automatically selects the best GPU configuration:
- **With 4-bit quantization**: Prioritizes fastest GPU (highest compute capability)
Expand Down Expand Up @@ -335,6 +359,8 @@ OLLAMA_HOST=http://localhost:11434
| `HEARTMULA_MODEL_DIR` | `backend/models` | Custom model directory (share with ComfyUI, etc.) |
| `HEARTMULA_4BIT` | `auto` | 4-bit quantization: `auto`, `true`, or `false` |
| `HEARTMULA_SEQUENTIAL_OFFLOAD` | `auto` | Model swapping for low VRAM: `auto`, `true`, or `false` |
| `HEARTMULA_COMPILE` | `false` | torch.compile for ~2x faster inference: `true` or `false` |
| `HEARTMULA_COMPILE_MODE` | `default` | Compile mode: `default`, `reduce-overhead`, or `max-autotune` |
| `HEARTMULA_VERSION` | `RL-3B-20260123` | Model version (latest RL-tuned model) |
| `CUDA_VISIBLE_DEVICES` | all GPUs | Specify which GPUs to use (e.g., `0,1`) |

Expand Down
27 changes: 27 additions & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,30 @@
# OpenRouter API Key (optional - for cloud LLM access)
# Get your key at https://openrouter.ai/
OPENROUTER_API_KEY=your_openrouter_api_key_here

# ===== GPU Configuration =====
# Set to "true", "false", or "auto" (default: auto)
# HEARTMULA_4BIT=auto

# Set to "true" or "false" (default: auto-detected based on VRAM)
# HEARTMULA_SEQUENTIAL_OFFLOAD=auto

# ===== torch.compile (Performance Optimization) =====
# Enable torch.compile for ~2x faster inference on supported GPUs
# Requires PyTorch 2.0+ and Triton (pip install triton or triton-windows)
# Set to "true" or "false" (default: false)
# HEARTMULA_COMPILE=false

# torch.compile mode: "default", "reduce-overhead", or "max-autotune"
# - default: Good balance of compile time and performance
# - reduce-overhead: Reduces Python/launch overhead via CUDA graphs (best for small batches)
# - max-autotune: Best performance, but slowest compilation (recommended for production)
# HEARTMULA_COMPILE_MODE=default

# ===== Model Configuration =====
# Model version to use (default: RL-3B-20260123)
# Options: "3B", "RL-3B-20260123"
# HEARTMULA_VERSION=RL-3B-20260123

# Custom model directory (default: backend/models)
# HEARTMULA_MODEL_DIR=/path/to/your/models
96 changes: 96 additions & 0 deletions backend/app/services/music_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@
# These can be set manually via environment variables, or left as "auto" for automatic detection
# HEARTMULA_4BIT: "true", "false", or "auto" (default: auto)
# HEARTMULA_SEQUENTIAL_OFFLOAD: "true", "false", or "auto" (default: auto)
# HEARTMULA_COMPILE: "true" or "false" (default: false) - Enable torch.compile for ~2x faster inference
# HEARTMULA_COMPILE_MODE: "default", "reduce-overhead", or "max-autotune" (default: default)
# Raw environment values, normalized to lower case for comparison.
_4BIT_ENV = os.environ.get("HEARTMULA_4BIT", "auto").lower()
_OFFLOAD_ENV = os.environ.get("HEARTMULA_SEQUENTIAL_OFFLOAD", "auto").lower()
_COMPILE_ENV = os.environ.get("HEARTMULA_COMPILE", "false").lower()
_COMPILE_MODE_ENV = os.environ.get("HEARTMULA_COMPILE_MODE", "default").lower()

# Tri-state overrides: True/False when set explicitly, None when left on "auto"
# (None means "let the auto-detection logic decide").
ENABLE_4BIT_QUANTIZATION = None if _4BIT_ENV == "auto" else _4BIT_ENV == "true"
ENABLE_SEQUENTIAL_OFFLOAD = None if _OFFLOAD_ENV == "auto" else _OFFLOAD_ENV == "true"

# torch.compile is strictly opt-in; an unrecognized mode silently falls back
# to "default" rather than failing at import time.
ENABLE_TORCH_COMPILE = _COMPILE_ENV == "true"
TORCH_COMPILE_MODE = (
    _COMPILE_MODE_ENV
    if _COMPILE_MODE_ENV in ("default", "reduce-overhead", "max-autotune")
    else "default"
)

# VRAM thresholds for auto-detection (in GB)
VRAM_THRESHOLD_FULL_PRECISION = 20.0 # Can fit HeartMuLa (~11GB) + HeartCodec (~6GB) + KV cache (~4GB)
Expand Down Expand Up @@ -315,6 +321,65 @@ def ensure_models_downloaded(model_dir: str = DEFAULT_MODEL_DIR, version: str =
return model_dir


def apply_torch_compile(model, compile_mode: str = "default"):
    """
    Apply torch.compile to a HeartMuLa model for faster inference.

    Compiles the model's ``backbone`` and ``decoder`` submodules in place.
    This can provide ~2x speedup on supported GPUs (tested on RTX 4090, A100).
    The first run is slower due to compilation; subsequent runs are faster.

    Args:
        model: HeartMuLa model instance with ``backbone`` and ``decoder``
            attributes.
        compile_mode: One of "default", "reduce-overhead", or "max-autotune".

    Returns:
        The model with compiled submodules, or the original model unchanged
        if compilation is disabled or fails for any reason.
    """
    # Respect the module-level kill switch (HEARTMULA_COMPILE env var).
    if not ENABLE_TORCH_COMPILE:
        return model

    try:
        # Prefer the inductor backend, which requires Triton; fall back to
        # eager so a missing Triton install never breaks model loading.
        try:
            import triton  # noqa: F401 -- availability check only
            backend = "inductor"
            print("[torch.compile] Triton found - using inductor backend for optimal performance", flush=True)
        except ImportError:
            import warnings
            warnings.warn(
                "Triton not found. On Windows, install triton-windows for best performance: "
                "pip install -U 'triton-windows>=3.2,<3.3'. Falling back to eager backend."
            )
            backend = "eager"
            print("[torch.compile] Triton not found - using eager backend (slower)", flush=True)

        print(f"[torch.compile] Compiling HeartMuLa model (mode={compile_mode}, backend={backend})...", flush=True)
        print("[torch.compile] Note: First generation will be slower due to compilation.", flush=True)

        # dynamic=True avoids recompilation when tensor shapes vary between
        # generations (e.g. different sequence lengths per request).
        model.backbone = torch.compile(
            model.backbone,
            backend=backend,
            mode=compile_mode,
            dynamic=True,
        )
        model.decoder = torch.compile(
            model.decoder,
            backend=backend,
            mode=compile_mode,
            dynamic=True,
        )

        print("[torch.compile] Model compiled successfully!", flush=True)
        return model

    except Exception as e:
        # Never let an optional optimization break startup: warn loudly and
        # hand back the uncompiled model.
        import warnings
        warnings.warn(f"torch.compile failed ({e}), continuing without compilation")
        print(f"[torch.compile] Compilation failed: {e}. Continuing without torch.compile.", flush=True)
        return model


def configure_flash_attention_for_gpu(device_id: int):
"""
Configure Flash Attention based on GPU compute capability.
Expand Down Expand Up @@ -376,6 +441,8 @@ def create_quantized_pipeline(
mula_device: torch.device,
codec_device: torch.device,
lazy_codec: bool = False,
compile_model: bool = False,
compile_mode: str = "default",
) -> HeartMuLaGenPipeline:
"""
Create a HeartMuLa pipeline with 4-bit quantization for reduced VRAM usage.
Expand All @@ -388,6 +455,8 @@ def create_quantized_pipeline(
codec_device: Device for HeartCodec model
lazy_codec: If True, don't load HeartCodec upfront - load only when needed for decoding.
This allows fitting on 12GB GPUs by never having both models in VRAM.
compile_model: If True, apply torch.compile for faster inference
compile_mode: torch.compile mode ("default", "reduce-overhead", "max-autotune")
"""
from heartlib.pipelines.music_generation import _resolve_paths

Expand All @@ -412,6 +481,10 @@ def create_quantized_pipeline(
torch_dtype=torch.bfloat16,
quantization_config=bnb_config,
)

# Apply torch.compile if enabled
if compile_model:
heartmula = apply_torch_compile(heartmula, compile_mode)

heartcodec = None
if not lazy_codec:
Expand Down Expand Up @@ -667,6 +740,14 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
use_sequential_offload = auto_config["use_sequential_offload"]
print(f"[Config] Auto-detected: sequential offload = {use_sequential_offload}", flush=True)

# torch.compile settings
use_compile = ENABLE_TORCH_COMPILE
compile_mode = TORCH_COMPILE_MODE
if use_compile:
print(f"[Config] torch.compile ENABLED (mode={compile_mode})", flush=True)
else:
print(f"[Config] torch.compile DISABLED", flush=True)

# Store the detected config for reference
self.gpu_config = auto_config

Expand Down Expand Up @@ -695,6 +776,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
mula_device=torch.device("cuda"),
codec_device=torch.device("cuda"),
lazy_codec=True, # Don't load HeartCodec upfront
compile_model=use_compile,
compile_mode=compile_mode,
)
return patch_pipeline_with_callback(pipeline, sequential_offload=True)
else:
Expand All @@ -705,6 +788,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
mula_device=torch.device("cuda"),
codec_device=torch.device("cuda"),
lazy_codec=False,
compile_model=use_compile,
compile_mode=compile_mode,
)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)
elif use_sequential_offload:
Expand All @@ -724,6 +809,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
version=version,
lazy_load=True,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=True)
else:
# Without quantization, use lazy loading - codec stays on CPU
Expand All @@ -740,6 +828,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
version=version,
lazy_load=True,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)

# Multi-GPU setup
Expand Down Expand Up @@ -782,6 +873,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
model_path, version,
mula_device=torch.device(f"cuda:{mula_gpu}"),
codec_device=torch.device(f"cuda:{codec_gpu}"),
compile_model=use_compile,
compile_mode=compile_mode,
)
else:
pipeline = HeartMuLaGenPipeline.from_pretrained(
Expand All @@ -796,6 +889,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
},
version=version,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)

async def initialize(self, model_path: Optional[str] = None, version: str = None):
Expand Down