Skip to content

Commit

Permalink
benchmarks: add oneflow
Browse files Browse the repository at this point in the history
  • Loading branch information
isidentical committed Nov 4, 2023
1 parent 501e494 commit 57c13d5
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 7 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
| Diffusers (fp16, SDPA) | 1.591s | 1.590s | 1.581s | 1.601s | 31.44 it/s |
| Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s |
| Diffusers (fp16, SDPA, compiled, NCHW channels last) | 1.066s | 1.065s | 1.062s | 1.076s | 46.95 it/s |
| OneFlow | 0.951s | 0.953s | 0.941s | 0.957s | 52.48 it/s |
| TensorRT 9.0 (cuda graphs, static shapes) | 0.819s | 0.818s | 0.817s | 0.821s | 61.14 it/s |

### SDXL Benchmarks
Expand All @@ -24,6 +25,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai).
| Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s |
| Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s |
| Diffusers (fp16, SDPA, compiled, NCHW channels last) | 5.132s | 5.132s | 5.121s | 5.142s | 9.74 it/s |
| OneFlow | 4.605s | 4.607s | 4.581s | 4.625s | 10.85 it/s |
| TensorRT 9.0 (cuda graphs, static shapes) | 4.102s | 4.104s | 4.091s | 4.107s | 12.18 it/s |

<!-- END TABLE -->
Expand Down
2 changes: 1 addition & 1 deletion artifacts/latest.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 
5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}]}
{"settings": {"warmup_iterations": 3, "benchmark_iterations": 10}, "parameters": {"prompt": "A photo of a cat", "steps": 50}, "timings": [{"name": "Diffusers (fp16, SDPA)", "category": "SD1.5", "timings": [1.5917212970089167, 1.5975631090113893, 1.5821007050108165, 1.5864128279790748, 1.5813008210097905, 1.588955162995262, 1.583035584015306, 1.5979954930080567, 1.6009252599906176, 1.5956080609757919]}, {"name": "Diffusers (fp16, xformers)", "category": "SD1.5", "timings": [1.7560910189931747, 1.7572659730212763, 1.7597715989977587, 1.7469689899880905, 1.763645778002683, 1.748716948000947, 1.7602629070170224, 1.7721076029993128, 1.7460152900021058, 1.7701677379955072]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SD1.5", "timings": [1.356168844999047, 1.354804383998271, 1.3516721340129152, 1.3500280909938738, 1.3562533959920984, 1.3556265980005264, 1.3505920349853113, 1.3477569509996101, 1.3498703970108181, 1.3481854719866533]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SD1.5", "timings": [1.0672315989795607, 1.0727007249952294, 1.0632865040097386, 1.0763663580000866, 1.06514667099691, 1.065665372996591, 1.0638107580016367, 1.0616009290097281, 1.0649084030010272, 1.063036303006811]}, {"name": "Diffusers (fp16, SDPA)", "category": "SDXL", "timings": [5.940763157996116, 5.926704184006667, 5.932992869988084, 5.940833892993396, 5.923987179005053, 5.938259807007853, 5.923574882996036, 5.930732762994012, 5.942996845988091, 5.932096109987469]}, {"name": "Diffusers (fp16, xformers)", "category": "SDXL", "timings": [5.728389803000027, 5.7223248230002355, 5.713896728004329, 5.7198221340077, 5.716055455995956, 5.730836973001715, 5.725524671986932, 5.730034602980595, 5.726219657983165, 5.722418188001029]}, {"name": "Diffusers (fp16, SDPA, compiled)", "category": "SDXL", "timings": [5.233289741008775, 5.24713467201218, 5.235783365002135, 5.239803472999483, 5.251854731992353, 5.242411447019549, 5.250333832023898, 
5.259196978004184, 5.247713554999791, 5.255097048007883]}, {"name": "Diffusers (fp16, SDPA, compiled, NCHW channels last)", "category": "SDXL", "timings": [5.12098954099929, 5.122773736016825, 5.130043459008448, 5.131235945009394, 5.132947302015964, 5.1301643929909915, 5.13384268200025, 5.141838512994582, 5.139718485996127, 5.141162898013135]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SD1.5", "timings": [0.819957683008397, 0.8171751589979976, 0.8198997500003316, 0.8168765410082415, 0.8175504659884609, 0.817866342025809, 0.8211427440110128, 0.8207452670030762, 0.8174457829736639, 0.8177875310066156]}, {"name": "TensorRT 9.0 (cuda graphs, static shapes)", "category": "SDXL", "timings": [4.099050192977302, 4.091173734981567, 4.09869981801603, 4.100261182000395, 4.1056046999874525, 4.1030455399886705, 4.104289636015892, 4.105645445990376, 4.1050181849859655, 4.106528664997313]}, {"name": "OneFlow", "category": "SD1.5", "timings": [0.9568120219919365, 0.9468847009993624, 0.9545126229932066, 0.9472718389879446, 0.9552929110068362, 0.9412291230109986, 0.9544001989997923, 0.9557115529896691, 0.9509655799774919, 0.9482630330021493]}, {"name": "OneFlow", "category": "SDXL", "timings": [4.586631373997079, 4.61347366499831, 4.600411992985755, 4.6092570440087, 4.611457958992105, 4.604861573025119, 4.602566407003906, 4.624956093000947, 4.615925558988238, 4.580915530998027]}]}
6 changes: 4 additions & 2 deletions benchmarks/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@

from rich.progress import track

from benchmarks import benchmark_diffusers, benchmark_tensorrt
from benchmarks import benchmark_diffusers, benchmark_oneflow, benchmark_tensorrt
from benchmarks.settings import BenchmarkSettings, InputParameters

# Flat list of every benchmark definition to run, built by unpacking the
# LOCAL_BENCHMARKS collection exported by each backend module. The order here
# (diffusers, then TensorRT, then OneFlow) is the order the entries appear in
# the combined list.
ALL_BENCHMARKS = [
*benchmark_diffusers.LOCAL_BENCHMARKS,
*benchmark_tensorrt.LOCAL_BENCHMARKS,
*benchmark_oneflow.LOCAL_BENCHMARKS,
]


Expand Down Expand Up @@ -76,7 +77,8 @@ def main() -> None:
previous_timings = load_previous_timings(session_file, settings, parameters)
for benchmark in track(ALL_BENCHMARKS, description="Running benchmarks..."):
benchmark_key = (benchmark["category"], benchmark["name"])
if benchmark_key in previous_timings and not options.force_run:
should_skip = benchmark.get("skip_if", False)
if benchmark_key in previous_timings and (not options.force_run or should_skip):
print(f"Skipping {benchmark_key} (already run)")
timings.append(
{
Expand Down
74 changes: 74 additions & 0 deletions benchmarks/benchmark_oneflow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

import sys
from functools import partial

import fal

from benchmarks.settings import BenchmarkResults, BenchmarkSettings, InputParameters


@fal.function(
    requirements=[
        "--pre",
        "torch==2.1.0",
        "transformers==4.27.1",
        "diffusers[torch]==0.19.3",
        "onediff",
        "oneflow",
        "-f",
        "https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu118",
    ],
    machine_type="GPU",
    python_version="3.10",
)
def oneflow_any(
    benchmark_settings: BenchmarkSettings,
    parameters: InputParameters,
    model_name: str,
) -> BenchmarkResults:
    """Benchmark a diffusers pipeline whose UNet has been compiled by OneFlow.

    Args:
        benchmark_settings: Settings object whose ``apply`` method receives
            the inference callable and produces the results.
        parameters: Supplies the ``prompt`` and the number of inference
            ``steps`` passed to the pipeline.
        model_name: Model identifier handed to
            ``DiffusionPipeline.from_pretrained``.

    Returns:
        Whatever ``benchmark_settings.apply`` returns for the inference
        callable.
    """
    # These packages are listed in the decorator's `requirements`; they are
    # imported here rather than at module level -- presumably because they are
    # only installed in the environment where this function executes (TODO
    # confirm against fal's documentation).
    import oneflow as flow
    import torch
    from diffusers import DiffusionPipeline
    from onediff.infer_compiler import oneflow_compile

    # Load the half-precision weights from safetensors files.
    load_options = {
        "torch_dtype": torch.float16,
        "use_safetensors": True,
    }
    pipe = DiffusionPipeline.from_pretrained(model_name, **load_options)
    pipe.to("cuda")

    # Replace the UNet with its OneFlow-compiled counterpart.
    pipe.unet = oneflow_compile(pipe.unet)

    # The benchmark runs while the OneFlow autocast context is active.
    with flow.autocast("cuda"):
        return benchmark_settings.apply(
            partial(
                pipe,
                parameters.prompt,
                num_inference_steps=parameters.steps,
            )
        )


# OneFlow doesn't support Python versions newer than 3.10 (which we actively
# use), so we are skipping these benchmarks in our continuous integration suite
# for now. If you make a change, be sure to run them manually on your computer
# and commit the results as artifacts.
#
# NOTE: the check must compare against (3, 11). `sys.version_info` is a
# five-field tuple, so on any 3.10.x interpreter `sys.version_info > (3, 10)`
# is already True (e.g. (3, 10, 12, "final", 0) > (3, 10)), which would flag
# the one supported version as unsupported.
_ONEFLOW_UNSUPPORTED_PYTHON = sys.version_info >= (3, 11)

# Benchmark definitions exported by this module. Each entry carries a display
# name, a model category, the function to call, the extra keyword arguments
# for it, and a "skip_if" flag that is True when the running interpreter is
# too new for OneFlow.
LOCAL_BENCHMARKS = [
    {
        "name": "OneFlow",
        "category": "SD1.5",
        "function": oneflow_any,
        "kwargs": {
            "model_name": "runwayml/stable-diffusion-v1-5",
        },
        "skip_if": _ONEFLOW_UNSUPPORTED_PYTHON,
    },
    {
        "name": "OneFlow",
        "category": "SDXL",
        "function": oneflow_any,
        "kwargs": {
            "model_name": "stabilityai/stable-diffusion-xl-base-1.0",
        },
        "skip_if": _ONEFLOW_UNSUPPORTED_PYTHON,
    },
]
4 changes: 0 additions & 4 deletions benchmarks/benchmark_tensorrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ def prepare_tensorrt() -> Path:
"https://pypi.ngc.nvidia.com",
],
machine_type="GPU",
_scheduler="nomad",
_scheduler_options={
"target_node": "65.21.219.34",
},
)
def tensorrt_any(
benchmark_settings: BenchmarkSettings,
Expand Down

0 comments on commit 57c13d5

Please sign in to comment.