Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,30 @@ Automatically configured based on your GPU:
| NVIDIA SM 6.x and older (Pascal, Maxwell) | ❌ Disabled (uses math backend) |
| AMD GPUs | ❌ Disabled (compatibility varies) |

### 🔥 torch.compile (Experimental)
Enable PyTorch 2.0+ compilation for **~2x faster inference** on supported GPUs:

```bash
# Enable torch.compile
HEARTMULA_COMPILE=true python -m uvicorn backend.app.main:app --host 0.0.0.0 --port 8000

# With max performance (slower first run, faster subsequent runs)
HEARTMULA_COMPILE=true HEARTMULA_COMPILE_MODE=max-autotune python -m uvicorn backend.app.main:app --host 0.0.0.0 --port 8000
```

| Mode | Description |
|------|-------------|
| `default` | Good balance of compile time and performance |
| `reduce-overhead` | Reduces Python/kernel-launch overhead via CUDA graphs — best for small batches (uses some extra memory) |
| `max-autotune` | Best performance, but slowest compilation (recommended for production) |

**Requirements:**
- PyTorch 2.0+
- **Linux/WSL2**: Install Triton (`pip install triton`)
- **Windows**: Install Triton-Windows (`pip install -U 'triton-windows>=3.2,<3.3'`)

> **Note:** First generation will be slower due to compilation. Subsequent generations benefit from the compiled kernels.

### 🎯 Smart Multi-GPU Detection
Automatically selects the best GPU configuration:
- **With 4-bit quantization**: Prioritizes fastest GPU (highest compute capability)
Expand Down Expand Up @@ -335,6 +359,8 @@ OLLAMA_HOST=http://localhost:11434
| `HEARTMULA_MODEL_DIR` | `backend/models` | Custom model directory (share with ComfyUI, etc.) |
| `HEARTMULA_4BIT` | `auto` | 4-bit quantization: `auto`, `true`, or `false` |
| `HEARTMULA_SEQUENTIAL_OFFLOAD` | `auto` | Model swapping for low VRAM: `auto`, `true`, or `false` |
| `HEARTMULA_COMPILE` | `false` | torch.compile for ~2x faster inference: `true` or `false` |
| `HEARTMULA_COMPILE_MODE` | `default` | Compile mode: `default`, `reduce-overhead`, or `max-autotune` |
| `HEARTMULA_VERSION` | `RL-3B-20260123` | Model version (latest RL-tuned model) |
| `CUDA_VISIBLE_DEVICES` | all GPUs | Specify which GPUs to use (e.g., `0,1`) |

Expand Down
27 changes: 27 additions & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,30 @@
# OpenRouter API Key (optional - for cloud LLM access)
# Get your key at https://openrouter.ai/
OPENROUTER_API_KEY=your_openrouter_api_key_here

# ===== GPU Configuration =====
# Set to "true", "false", or "auto" (default: auto)
# HEARTMULA_4BIT=auto

# Set to "true" or "false" (default: auto-detected based on VRAM)
# HEARTMULA_SEQUENTIAL_OFFLOAD=auto

# ===== torch.compile (Performance Optimization) =====
# Enable torch.compile for ~2x faster inference on supported GPUs
# Requires PyTorch 2.0+ and Triton (pip install triton or triton-windows)
# Set to "true" or "false" (default: false)
# HEARTMULA_COMPILE=false

# torch.compile mode: "default", "reduce-overhead", or "max-autotune"
# - default: Good balance of compile time and performance
# - reduce-overhead: Reduces Python/launch overhead via CUDA graphs (best for small batches)
# - max-autotune: Best performance, but slowest compilation (recommended for production)
# HEARTMULA_COMPILE_MODE=default

# ===== Model Configuration =====
# Model version to use (default: RL-3B-20260123)
# Options: "3B", "RL-3B-20260123"
# HEARTMULA_VERSION=RL-3B-20260123

# Custom model directory (default: backend/models)
# HEARTMULA_MODEL_DIR=/path/to/your/models
96 changes: 96 additions & 0 deletions backend/app/services/music_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,18 @@
# These can be set manually via environment variables, or left as "auto" for automatic detection
# HEARTMULA_4BIT: "true", "false", or "auto" (default: auto)
# HEARTMULA_SEQUENTIAL_OFFLOAD: "true", "false", or "auto" (default: auto)
# HEARTMULA_COMPILE: "true" or "false" (default: false) - Enable torch.compile for ~2x faster inference
# HEARTMULA_COMPILE_MODE: "default", "reduce-overhead", or "max-autotune" (default: default)
# Raw environment values, normalized to lower case for comparison.
_4BIT_ENV = os.environ.get("HEARTMULA_4BIT", "auto").lower()
_OFFLOAD_ENV = os.environ.get("HEARTMULA_SEQUENTIAL_OFFLOAD", "auto").lower()
_COMPILE_ENV = os.environ.get("HEARTMULA_COMPILE", "false").lower()
_COMPILE_MODE_ENV = os.environ.get("HEARTMULA_COMPILE_MODE", "default").lower()

# Tri-state overrides: True/False when set explicitly, None when left on "auto"
# (None means "let the auto-detection logic decide").
ENABLE_4BIT_QUANTIZATION = None if _4BIT_ENV == "auto" else _4BIT_ENV == "true"
ENABLE_SEQUENTIAL_OFFLOAD = None if _OFFLOAD_ENV == "auto" else _OFFLOAD_ENV == "true"

# torch.compile is strictly opt-in; an unrecognized mode silently falls back
# to "default" rather than failing at import time.
ENABLE_TORCH_COMPILE = _COMPILE_ENV == "true"
TORCH_COMPILE_MODE = (
    _COMPILE_MODE_ENV
    if _COMPILE_MODE_ENV in ("default", "reduce-overhead", "max-autotune")
    else "default"
)

# VRAM thresholds for auto-detection (in GB)
VRAM_THRESHOLD_FULL_PRECISION = 20.0 # Can fit HeartMuLa (~11GB) + HeartCodec (~6GB) + KV cache (~4GB)
Expand Down Expand Up @@ -315,6 +321,65 @@ def ensure_models_downloaded(model_dir: str = DEFAULT_MODEL_DIR, version: str =
return model_dir


def apply_torch_compile(model, compile_mode: str = "default"):
    """
    Apply torch.compile to a HeartMuLa model for faster inference.

    Compiles the model's ``backbone`` and ``decoder`` submodules in place.
    This can provide ~2x speedup on supported GPUs (tested on RTX 4090, A100).
    The first run is slower due to compilation; subsequent runs are faster.

    Args:
        model: HeartMuLa model instance with ``backbone`` and ``decoder``
            attributes.
        compile_mode: One of "default", "reduce-overhead", or "max-autotune".

    Returns:
        The model with compiled submodules, or the original model unchanged
        if compilation is disabled or fails for any reason.
    """
    # Respect the module-level kill switch (HEARTMULA_COMPILE env var).
    if not ENABLE_TORCH_COMPILE:
        return model

    try:
        # Prefer the inductor backend, which requires Triton; fall back to
        # eager so a missing Triton install never breaks model loading.
        try:
            import triton  # noqa: F401 -- availability check only
            backend = "inductor"
            print("[torch.compile] Triton found - using inductor backend for optimal performance", flush=True)
        except ImportError:
            import warnings
            warnings.warn(
                "Triton not found. On Windows, install triton-windows for best performance: "
                "pip install -U 'triton-windows>=3.2,<3.3'. Falling back to eager backend."
            )
            backend = "eager"
            print("[torch.compile] Triton not found - using eager backend (slower)", flush=True)

        print(f"[torch.compile] Compiling HeartMuLa model (mode={compile_mode}, backend={backend})...", flush=True)
        print("[torch.compile] Note: First generation will be slower due to compilation.", flush=True)

        # dynamic=True avoids recompilation when tensor shapes vary between
        # generations (e.g. different sequence lengths per request).
        model.backbone = torch.compile(
            model.backbone,
            backend=backend,
            mode=compile_mode,
            dynamic=True,
        )
        model.decoder = torch.compile(
            model.decoder,
            backend=backend,
            mode=compile_mode,
            dynamic=True,
        )

        print("[torch.compile] Model compiled successfully!", flush=True)
        return model

    except Exception as e:
        # Never let an optional optimization break startup: warn loudly and
        # hand back the uncompiled model.
        import warnings
        warnings.warn(f"torch.compile failed ({e}), continuing without compilation")
        print(f"[torch.compile] Compilation failed: {e}. Continuing without torch.compile.", flush=True)
        return model


def configure_flash_attention_for_gpu(device_id: int):
"""
Configure Flash Attention based on GPU compute capability.
Expand Down Expand Up @@ -376,6 +441,8 @@ def create_quantized_pipeline(
mula_device: torch.device,
codec_device: torch.device,
lazy_codec: bool = False,
compile_model: bool = False,
compile_mode: str = "default",
) -> HeartMuLaGenPipeline:
"""
Create a HeartMuLa pipeline with 4-bit quantization for reduced VRAM usage.
Expand All @@ -388,6 +455,8 @@ def create_quantized_pipeline(
codec_device: Device for HeartCodec model
lazy_codec: If True, don't load HeartCodec upfront - load only when needed for decoding.
This allows fitting on 12GB GPUs by never having both models in VRAM.
compile_model: If True, apply torch.compile for faster inference
compile_mode: torch.compile mode ("default", "reduce-overhead", "max-autotune")
"""
from heartlib.pipelines.music_generation import _resolve_paths

Expand All @@ -412,6 +481,10 @@ def create_quantized_pipeline(
torch_dtype=torch.bfloat16,
quantization_config=bnb_config,
)

# Apply torch.compile if enabled
if compile_model:
heartmula = apply_torch_compile(heartmula, compile_mode)

heartcodec = None
if not lazy_codec:
Expand Down Expand Up @@ -667,6 +740,14 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
use_sequential_offload = auto_config["use_sequential_offload"]
print(f"[Config] Auto-detected: sequential offload = {use_sequential_offload}", flush=True)

# torch.compile settings
use_compile = ENABLE_TORCH_COMPILE
compile_mode = TORCH_COMPILE_MODE
if use_compile:
print(f"[Config] torch.compile ENABLED (mode={compile_mode})", flush=True)
else:
print(f"[Config] torch.compile DISABLED", flush=True)

# Store the detected config for reference
self.gpu_config = auto_config

Expand Down Expand Up @@ -695,6 +776,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
mula_device=torch.device("cuda"),
codec_device=torch.device("cuda"),
lazy_codec=True, # Don't load HeartCodec upfront
compile_model=use_compile,
compile_mode=compile_mode,
)
return patch_pipeline_with_callback(pipeline, sequential_offload=True)
else:
Expand All @@ -705,6 +788,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
mula_device=torch.device("cuda"),
codec_device=torch.device("cuda"),
lazy_codec=False,
compile_model=use_compile,
compile_mode=compile_mode,
)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)
elif use_sequential_offload:
Expand All @@ -724,6 +809,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
version=version,
lazy_load=True,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=True)
else:
# Without quantization, use lazy loading - codec stays on CPU
Expand All @@ -740,6 +828,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
version=version,
lazy_load=True,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)

# Multi-GPU setup
Expand Down Expand Up @@ -782,6 +873,8 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
model_path, version,
mula_device=torch.device(f"cuda:{mula_gpu}"),
codec_device=torch.device(f"cuda:{codec_gpu}"),
compile_model=use_compile,
compile_mode=compile_mode,
)
else:
pipeline = HeartMuLaGenPipeline.from_pretrained(
Expand All @@ -796,6 +889,9 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
},
version=version,
)
# Apply torch.compile if enabled
if use_compile:
pipeline._mula = apply_torch_compile(pipeline._mula, compile_mode)
return patch_pipeline_with_callback(pipeline, sequential_offload=False)

async def initialize(self, model_path: Optional[str] = None, version: str = None):
Expand Down