diff --git a/README.md b/README.md
index 1ac50ba..afc8ce4 100644
--- a/README.md
+++ b/README.md
@@ -13,12 +13,14 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
 |------------------|----------|------------|---------|---------|--------------|
 | Diffusers (fp16, SDPA) | 1.591s | 1.590s | 1.581s | 1.601s | 31.44 it/s |
 | Diffusers (fp16, xformers) | 1.758s | 1.759s | 1.746s | 1.772s | 28.43 it/s |
+| Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s |
 
 ### SDXL Benchmarks
 
 |                  | mean (s) | median (s) | min (s) | max (s) | speed (it/s) |
 |------------------|----------|------------|---------|---------|--------------|
 | Diffusers (fp16, SDPA) | 5.933s | 5.933s | 5.924s | 5.943s | 8.43 it/s |
 | Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s |
+| Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s |
diff --git a/benchmarks/diffusers.py b/benchmarks/diffusers.py
index 6aaa4ef..ea34c04 100644
--- a/benchmarks/diffusers.py
+++ b/benchmarks/diffusers.py
@@ -1,3 +1,4 @@
+import os
 from functools import partial
 
 import fal
@@ -20,7 +21,14 @@ def diffusers_any(
     parameters: InputParameters,
     model_name: str,
     enable_xformers: bool = False,
+    compile: bool = False,
 ) -> BenchmarkResults:
+    # Some of this functionality might not be available in torch 2.1,
+    # but we set it anyway in case we upgrade to a newer version of
+    # torch in the future.
+    os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/data/torch-cache"
+    os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"
+
     import torch
     from diffusers import DiffusionPipeline
 
@@ -30,9 +38,21 @@
         use_safetensors=True,
     )
     pipeline.to("cuda")
+
+    # Use xFormers memory-efficient attention instead of torch SDPA,
+    # which may itself dispatch to memory-efficient attention
+    # (alongside flash attention).
     if enable_xformers:
         pipeline.enable_xformers_memory_efficient_attention()
 
+    # The mode here is reduce-overhead, a balanced compromise between
+    # compilation time and runtime performance. The other modes are
+    # possible choices for future benchmarks.
+    if compile:
+        pipeline.unet = torch.compile(
+            pipeline.unet, fullgraph=True, mode="reduce-overhead"
+        )
+
     return benchmark_settings.apply(
         partial(
             pipeline,
@@ -60,6 +80,15 @@
             "enable_xformers": True,
         },
     },
+    {
+        "name": "Diffusers (fp16, SDPA, compiled)",
+        "category": "SD1.5",
+        "function": diffusers_any,
+        "kwargs": {
+            "model_name": "runwayml/stable-diffusion-v1-5",
+            "compile": True,
+        },
+    },
     {
         "name": "Diffusers (fp16, SDPA)",
         "category": "SDXL",
@@ -77,4 +106,13 @@
             "enable_xformers": True,
         },
     },
+    {
+        "name": "Diffusers (fp16, SDPA, compiled)",
+        "category": "SDXL",
+        "function": diffusers_any,
+        "kwargs": {
+            "model_name": "stabilityai/stable-diffusion-xl-base-1.0",
+            "compile": True,
+        },
+    },
 ]
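
For reference, here is a minimal standalone sketch of the new compiled path outside the fal harness. It assumes torch >= 2.1 with a CUDA GPU and the fp16 load options implied by the benchmark names; the prompt string, step count, and output filename are illustrative, and the cache directory mirrors the `/data/torch-cache` path used above.

```python
import os

# Set these before importing torch so inductor is guaranteed to pick them up.
os.environ["TORCHINDUCTOR_CACHE_DIR"] = "/data/torch-cache"
os.environ["TORCHINDUCTOR_FX_GRAPH_CACHE"] = "1"

import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipeline.to("cuda")

# Same settings as the benchmark: compile the UNet as a single graph in
# reduce-overhead mode, which captures CUDA graphs to cut launch overhead.
pipeline.unet = torch.compile(
    pipeline.unet, fullgraph=True, mode="reduce-overhead"
)

# Warm-up call: the first invocation pays for compilation (or an FX graph
# cache hit) plus CUDA graph capture, so it must be excluded from timing.
pipeline("an astronaut riding a horse", num_inference_steps=50)

# Steady-state calls now run the compiled UNet.
image = pipeline("an astronaut riding a horse", num_inference_steps=50).images[0]
image.save("astronaut.png")
```

The harness's `benchmark_settings.apply` presumably performs its own warm-up before measuring; without one, the reduce-overhead numbers would be dominated by the one-time compilation cost.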
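
The comment about torch SDPA overlapping with xFormers can be made concrete: `scaled_dot_product_attention` dispatches among flash, memory-efficient, and math kernels at runtime. A small sketch, assuming torch 2.1's `torch.backends.cuda.sdp_kernel` context manager (later releases move this to `torch.nn.attention.sdpa_kernel`), that pins the memory-efficient kernel to observe it in isolation:

```python
import torch
import torch.nn.functional as F

# Dummy attention inputs: (batch, heads, sequence, head_dim).
q = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Restrict SDPA to the memory-efficient kernel; if it were unavailable for
# these shapes/dtypes, this call would raise instead of silently falling back.
with torch.backends.cuda.sdp_kernel(
    enable_flash=False, enable_math=False, enable_mem_efficient=True
):
    out = F.scaled_dot_product_attention(q, k, v)

print(out.shape)  # torch.Size([1, 8, 1024, 64])
```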
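
And since the FX graph cache only pays off if it survives between benchmark processes, here is a quick hypothetical helper (the function name and output format are ours, not part of the harness) to confirm that inductor is writing artifacts into the persistent `/data` volume:

```python
import os

def cache_summary(cache_dir: str = "/data/torch-cache") -> None:
    # Walk the inductor cache directory and report what has accumulated.
    total_files = 0
    total_bytes = 0
    for root, _dirs, files in os.walk(cache_dir):
        for name in files:
            total_files += 1
            total_bytes += os.path.getsize(os.path.join(root, name))
    print(f"{total_files} cached files, {total_bytes / 1e6:.1f} MB under {cache_dir}")

cache_summary()
```

If the counts grow on the first compiled run and stay flat on subsequent runs, later processes are hitting the cache instead of recompiling.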