Skip to content

Commit

Permalink
benchmarks: add NCHW channels last + compiled combination
Browse files Browse the repository at this point in the history
  • Loading branch information
isidentical committed Nov 4, 2023
1 parent a24d23a commit 64babfd
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 4 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,15 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
| Diffusers (fp16, SDPA) | 1.591s | 1.590s | 1.581s | 1.601s | 31.44 it/s |
| Diffusers (fp16, xformers) | 1.758s | 1.759s | 1.746s | 1.772s | 28.43 it/s |
| Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s |
| Diffusers (fp16, SDPA, compiled, NCHW channels last) | 1.066s | 1.065s | 1.062s | 1.076s | 46.95 it/s |

### SDXL Benchmarks
| | mean (s) | median (s) | min (s) | max (s) | speed (it/s) |
|------------------|----------|------------|---------|---------|--------------|
| Diffusers (fp16, SDPA) | 5.933s | 5.933s | 5.924s | 5.943s | 8.43 it/s |
| Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s |
| Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s |
| Diffusers (fp16, SDPA, compiled, NCHW channels last) | 5.132s | 5.132s | 5.121s | 5.142s | 9.74 it/s |

<!-- END TABLE -->

Expand Down
32 changes: 28 additions & 4 deletions benchmarks/diffusers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def diffusers_any(
parameters: InputParameters,
model_name: str,
enable_xformers: bool = False,
compile: bool = False,
use_compile: bool = False,
use_nchw_channels: bool = False,
) -> BenchmarkResults:
# Some of this functionality might not be available in torch 2.1,
# but setting just in case if in the future we upgrade to a newer
Expand All @@ -45,10 +46,13 @@ def diffusers_any(
if enable_xformers:
pipeline.enable_xformers_memory_efficient_attention()

if use_nchw_channels:
pipeline.unet = pipeline.unet.to(memory_format=torch.channels_last)

# The mode here is reduce-overhead, which is a balanced compromise between
# compilation time and runtime. The other modes might be a possible choice
# for future benchmarks.
if compile:
if use_compile:
pipeline.unet = torch.compile(
pipeline.unet, fullgraph=True, mode="reduce-overhead"
)
Expand Down Expand Up @@ -86,7 +90,17 @@ def diffusers_any(
"function": diffusers_any,
"kwargs": {
"model_name": "runwayml/stable-diffusion-v1-5",
"compile": True,
"use_compile": True,
},
},
{
"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)",
"category": "SD1.5",
"function": diffusers_any,
"kwargs": {
"model_name": "runwayml/stable-diffusion-v1-5",
"use_compile": True,
"use_nchw_channels": True,
},
},
{
Expand All @@ -112,7 +126,17 @@ def diffusers_any(
"function": diffusers_any,
"kwargs": {
"model_name": "stabilityai/stable-diffusion-xl-base-1.0",
"compile": True,
"use_compile": True,
},
},
{
"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)",
"category": "SDXL",
"function": diffusers_any,
"kwargs": {
"model_name": "stabilityai/stable-diffusion-xl-base-1.0",
"use_compile": True,
"use_nchw_channels": True,
},
},
]

0 comments on commit 64babfd

Please sign in to comment.