diff --git a/README.md b/README.md index e53b853..b75ed15 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai). | Diffusers (fp16, SDPA) | 1.591s | 1.590s | 1.581s | 1.601s | 31.44 it/s | | Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s | | Diffusers (fp16, SDPA, compiled, NCHW channels last) | 1.066s | 1.065s | 1.062s | 1.076s | 46.95 it/s | +| OneFlow | 0.951s | 0.953s | 0.941s | 0.957s | 52.48 it/s | | TensorRT 9.0 (cuda graphs, static shapes) | 0.819s | 0.818s | 0.817s | 0.821s | 61.14 it/s | ### SDXL Benchmarks @@ -24,6 +25,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai). | Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s | | Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s | | Diffusers (fp16, SDPA, compiled, NCHW channels last) | 5.132s | 5.132s | 5.121s | 5.142s | 9.74 it/s | +| OneFlow | 4.605s | 4.607s | 4.581s | 4.625s | 10.85 it/s | | TensorRT 9.0 (cuda graphs, static shapes) | 4.102s | 4.104s | 4.091s | 4.107s | 12.18 it/s | diff --git a/artifacts/latest.json b/artifacts/latest.json index f396b15..da2869e 100644 --- a/artifacts/latest.json +++ b/artifacts/latest.json @@ -1 +1 @@ -{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, 
compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": 
[4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}]} +{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, 
SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}, {"name": "OneFlow", "category": "SD1.5", "timings": [0.9568120219919365, 0.9468847009993624, 0.9545126229932066, 0.9472718389879446, 0.9552929110068362, 0.9412291230109986, 0.9544001989997923, 0.9557115529896691, 0.9509655799774919, 0.9482630330021493]}, {"name": "OneFlow", "category": "SDXL", "timings": [4.586631373997079, 4.61347366499831, 4.600411992985755, 4.6092570440087, 4.611457958992105, 4.604861573025119, 4.602566407003906, 4.624956093000947, 4.615925558988238, 4.580915530998027]}]} diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py index 9efebc8..9b8ec91 100644 --- a/benchmarks/__main__.py +++ b/benchmarks/__main__.py @@ -6,12 +6,13 @@ from rich.progress import track -from benchmarks import benchmark_diffusers, benchmark_tensorrt +from benchmarks import benchmark_diffusers, benchmark_oneflow, benchmark_tensorrt from benchmarks.settings import BenchmarkSettings, 
InputParameters ALL_BENCHMARKS = [ *benchmark_diffusers.LOCAL_BENCHMARKS, *benchmark_tensorrt.LOCAL_BENCHMARKS, + *benchmark_oneflow.LOCAL_BENCHMARKS, ] @@ -76,7 +77,8 @@ def main() -> None: previous_timings = load_previous_timings(session_file, settings, parameters) for benchmark in track(ALL_BENCHMARKS, description="Running benchmarks..."): benchmark_key = (benchmark["category"], benchmark["name"]) - if benchmark_key in previous_timings and not options.force_run: + should_skip = benchmark.get("skip_if", False) + if benchmark_key in previous_timings and (not options.force_run or should_skip): print(f"Skipping {benchmark_key} (already run)") timings.append( { diff --git a/benchmarks/benchmark_oneflow.py b/benchmarks/benchmark_oneflow.py new file mode 100644 index 0000000..3de15d0 --- /dev/null +++ b/benchmarks/benchmark_oneflow.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import sys +from functools import partial + +import fal + +from benchmarks.settings import BenchmarkResults, BenchmarkSettings, InputParameters + + +@fal.function( + requirements=[ + "--pre", + "torch==2.1.0", + "transformers==4.27.1", + "diffusers[torch]==0.19.3", + "onediff", + "oneflow", + "-f", + "https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu118", + ], + machine_type="GPU", + python_version="3.10", +) +def oneflow_any( + benchmark_settings: BenchmarkSettings, + parameters: InputParameters, + model_name: str, +) -> BenchmarkResults: + import oneflow as flow + import torch + from diffusers import DiffusionPipeline + from onediff.infer_compiler import oneflow_compile + + pipeline = DiffusionPipeline.from_pretrained( + model_name, + torch_dtype=torch.float16, + use_safetensors=True, + ) + pipeline.to("cuda") + pipeline.unet = oneflow_compile(pipeline.unet) + + with flow.autocast("cuda"): + infer_func = partial( + pipeline, parameters.prompt, num_inference_steps=parameters.steps + ) + return benchmark_settings.apply(infer_func) + + +# Since OneFlow doesn't 
support Python>3.10 (which we actively use), we are +# skipping these benchmarks in our continuous integration suite for now. If you +# make a change, be sure to run them manually on your computer and commit the +# results as artifacts. + +LOCAL_BENCHMARKS = [ + { + "name": "OneFlow", + "category": "SD1.5", + "function": oneflow_any, + "kwargs": { + "model_name": "runwayml/stable-diffusion-v1-5", + }, + "skip_if": sys.version_info >= (3, 11), + }, + { + "name": "OneFlow", + "category": "SDXL", + "function": oneflow_any, + "kwargs": { + "model_name": "stabilityai/stable-diffusion-xl-base-1.0", + }, + "skip_if": sys.version_info >= (3, 11), + }, +] diff --git a/benchmarks/benchmark_tensorrt.py b/benchmarks/benchmark_tensorrt.py index 2f908ec..a336247 100644 --- a/benchmarks/benchmark_tensorrt.py +++ b/benchmarks/benchmark_tensorrt.py @@ -65,10 +65,6 @@ def prepare_tensorrt() -> Path: "https://pypi.ngc.nvidia.com", ], machine_type="GPU", - _scheduler="nomad", - _scheduler_options={ - "target_node": "65.21.219.34", - }, ) def tensorrt_any( benchmark_settings: BenchmarkSettings,