Skip to content

Commit

Permalink
benchmarks: add torch.compile'd SD/SDXL
Browse files Browse the repository at this point in the history
  • Loading branch information
isidentical committed Nov 4, 2023
1 parent aa3a059 commit a24d23a
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,14 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
|------------------|----------|------------|---------|---------|--------------|
| Diffusers (fp16, SDPA) | 1.591s | 1.590s | 1.581s | 1.601s | 31.44 it/s |
| Diffusers (fp16, xformers) | 1.758s | 1.759s | 1.746s | 1.772s | 28.43 it/s |
| Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s |

### SDXL Benchmarks
| | mean (s) | median (s) | min (s) | max (s) | speed (it/s) |
|------------------|----------|------------|---------|---------|--------------|
| Diffusers (fp16, SDPA) | 5.933s | 5.933s | 5.924s | 5.943s | 8.43 it/s |
| Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s |
| Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s |

<!-- END TABLE -->

Expand Down
38 changes: 38 additions & 0 deletions benchmarks/diffusers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from functools import partial

import fal
Expand All @@ -20,7 +21,14 @@ def diffusers_any(
parameters: InputParameters,
model_name: str,
enable_xformers: bool = False,
compile: bool = False,
) -> BenchmarkResults:
    # Some of this functionality might not be available in torch 2.1,
    # but we set it just in case, so it takes effect if we later upgrade
    # to a newer version of torch.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/data/torch-cache"
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"

import torch
from diffusers import DiffusionPipeline

Expand All @@ -30,9 +38,21 @@ def diffusers_any(
use_safetensors=True,
)
pipeline.to("cuda")

# Use XFormers memory efficient attention instead of Torch SDPA
# which might also utilize memory efficient attention (alongside
# flash attention).
if enable_xformers:
pipeline.enable_xformers_memory_efficient_attention()

    # The mode here is reduce-overhead, which is a balanced compromise between
    # compilation time and runtime. The other modes might be possible choices
    # for future benchmarks.
if compile:
pipeline.unet = torch.compile(
pipeline.unet, fullgraph=True, mode="reduce-overhead"
)

return benchmark_settings.apply(
partial(
pipeline,
Expand Down Expand Up @@ -60,6 +80,15 @@ def diffusers_any(
"enable_xformers": True,
},
},
{
"name": "Diffusers (fp16, SDPA, compiled)",
"category": "SD1.5",
"function": diffusers_any,
"kwargs": {
"model_name": "runwayml/stable-diffusion-v1-5",
"compile": True,
},
},
{
"name": "Diffusers (fp16, SDPA)",
"category": "SDXL",
Expand All @@ -77,4 +106,13 @@ def diffusers_any(
"enable_xformers": True,
},
},
{
"name": "Diffusers (fp16, SDPA, compiled)",
"category": "SDXL",
"function": diffusers_any,
"kwargs": {
"model_name": "stabilityai/stable-diffusion-xl-base-1.0",
"compile": True,
},
},
]

0 comments on commit a24d23a

Please sign in to comment.