benchmarks: add oneflow

fal-ai · Nov 4, 2023 · 57c13d5 · 57c13d5
1 parent 501e494
commit 57c13d5
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
 | Diffusers (fp16, SDPA) |   1.591s |     1.590s |  1.581s |  1.601s |   31.44 it/s |
 | Diffusers (fp16, SDPA, compiled) |   1.352s |     1.351s |  1.348s |  1.356s |   37.01 it/s |
 | Diffusers (fp16, SDPA, compiled, NCHW channels last) |   1.066s |     1.065s |  1.062s |  1.076s |   46.95 it/s |
+| OneFlow          |   0.951s |     0.953s |  0.941s |  0.957s |   52.48 it/s |
 | TensorRT 9.0 (cuda graphs, static shapes) |   0.819s |     0.818s |  0.817s |  0.821s |   61.14 it/s |
 
 ### SDXL Benchmarks
@@ -24,6 +25,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
 | Diffusers (fp16, xformers) |   5.724s |     5.724s |  5.714s |  5.731s |    8.74 it/s |
 | Diffusers (fp16, SDPA, compiled) |   5.246s |     5.247s |  5.233s |  5.259s |    9.53 it/s |
 | Diffusers (fp16, SDPA, compiled, NCHW channels last) |   5.132s |     5.132s |  5.121s |  5.142s |    9.74 it/s |
+| OneFlow          |   4.605s |     4.607s |  4.581s |  4.625s |   10.85 it/s |
 | TensorRT 9.0 (cuda graphs, static shapes) |   4.102s |     4.104s |  4.091s |  4.107s |   12.18 it/s |
 
 <!-- END TABLE -->

diff --git a/artifacts/latest.json b/artifacts/latest.json
@@ -1 +1 @@
-{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}]}
+{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}, {"name": "OneFlow", "category": "SD1.5", "timings": [0.9568120219919365, 0.9468847009993624, 0.9545126229932066, 0.9472718389879446, 0.9552929110068362, 0.9412291230109986, 0.9544001989997923, 0.9557115529896691, 0.9509655799774919, 0.9482630330021493]}, {"name": "OneFlow", "category": "SDXL", "timings": [4.586631373997079, 4.61347366499831, 4.600411992985755, 4.6092570440087, 4.611457958992105, 4.604861573025119, 4.602566407003906, 4.624956093000947, 4.615925558988238, 4.580915530998027]}]}
diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py
@@ -6,12 +6,13 @@
 
 from rich.progress import track
 
-from benchmarks import benchmark_diffusers, benchmark_tensorrt
+from benchmarks import benchmark_diffusers, benchmark_oneflow, benchmark_tensorrt
 from benchmarks.settings import BenchmarkSettings, InputParameters
 
 ALL_BENCHMARKS = [
     *benchmark_diffusers.LOCAL_BENCHMARKS,
     *benchmark_tensorrt.LOCAL_BENCHMARKS,
+    *benchmark_oneflow.LOCAL_BENCHMARKS,
 ]
 
 
@@ -76,7 +77,8 @@ def main() -> None:
     previous_timings = load_previous_timings(session_file, settings, parameters)
     for benchmark in track(ALL_BENCHMARKS, description="Running benchmarks..."):
         benchmark_key = (benchmark["category"], benchmark["name"])
-        if benchmark_key in previous_timings and not options.force_run:
+        should_skip = benchmark.get("skip_if", False)
+        if benchmark_key in previous_timings and (not options.force_run or should_skip):
             print(f"Skipping {benchmark_key} (already run)")
             timings.append(
                 {

diff --git a/benchmarks/benchmark_oneflow.py b/benchmarks/benchmark_oneflow.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import sys
+from functools import partial
+
+import fal
+
+from benchmarks.settings import BenchmarkResults, BenchmarkSettings, InputParameters
+
+
+@fal.function(
+    requirements=[
+        "--pre",
+        "torch==2.1.0",
+        "transformers==4.27.1",
+        "diffusers[torch]==0.19.3",
+        "onediff",
+        "oneflow",
+        "-f",
+        "https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu118",
+    ],
+    machine_type="GPU",
+    python_version="3.10",
+)
+def oneflow_any(
+    benchmark_settings: BenchmarkSettings,
+    parameters: InputParameters,
+    model_name: str,
+) -> BenchmarkResults:
+    import oneflow as flow
+    import torch
+    from diffusers import DiffusionPipeline
+    from onediff.infer_compiler import oneflow_compile
+
+    pipeline = DiffusionPipeline.from_pretrained(
+        model_name,
+        torch_dtype=torch.float16,
+        use_safetensors=True,
+    )
+    pipeline.to("cuda")
+    pipeline.unet = oneflow_compile(pipeline.unet)
+
+    with flow.autocast("cuda"):
+        infer_func = partial(
+            pipeline, parameters.prompt, num_inference_steps=parameters.steps
+        )
+        return benchmark_settings.apply(infer_func)
+
+
+# Since OneFlow doesn't support Python>3.10 (which we actively use), we are
+# skipping these benchmarks in our continuous integration suite for now. If you
+# make a change, be sure to run them manually on your computer and commit the
+# results as artifacts.
+
+LOCAL_BENCHMARKS = [
+    {
+        "name": "OneFlow",
+        "category": "SD1.5",
+        "function": oneflow_any,
+        "kwargs": {
+            "model_name": "runwayml/stable-diffusion-v1-5",
+        },
+        "skip_if": sys.version_info > (3, 10),
+    },
+    {
+        "name": "OneFlow",
+        "category": "SDXL",
+        "function": oneflow_any,
+        "kwargs": {
+            "model_name": "stabilityai/stable-diffusion-xl-base-1.0",
+        },
+        "skip_if": sys.version_info > (3, 10),
+    },
+]
diff --git a/benchmarks/benchmark_tensorrt.py b/benchmarks/benchmark_tensorrt.py
@@ -65,10 +65,6 @@ def prepare_tensorrt() -> Path:
         "https://pypi.ngc.nvidia.com",
     ],
     machine_type="GPU",
-    _scheduler="nomad",
-    _scheduler_options={
-        "target_node": "65.21.219.34",
-    },
 )
 def tensorrt_any(
     benchmark_settings: BenchmarkSettings,
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}]}
		{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}, {"name": "OneFlow", "category": "SD1.5", "timings": [0.9568120219919365, 0.9468847009993624, 0.9545126229932066, 0.9472718389879446, 0.9552929110068362, 0.9412291230109986, 0.9544001989997923, 0.9557115529896691, 0.9509655799774919, 0.9482630330021493]}, {"name": "OneFlow", "category": "SDXL", "timings": [4.586631373997079, 4.61347366499831, 4.600411992985755, 4.6092570440087, 4.611457958992105, 4.604861573025119, 4.602566407003906, 4.624956093000947, 4.615925558988238, 4.580915530998027]}]}