From 879ada1bc34c783c634d3ccd2bce7fdb5e6c1906 Mon Sep 17 00:00:00 2001 From: Batuhan Taskaya Date: Sat, 4 Nov 2023 21:49:42 +0300 Subject: [PATCH] benchmarks: add TensorRT --- README.md | 2 + benchmarks/__main__.py | 5 +- .../{diffusers.py => benchmark_diffusers.py} | 11 +- benchmarks/benchmark_tensorrt.py | 165 ++++++++++++++++++ 4 files changed, 175 insertions(+), 8 deletions(-) rename benchmarks/{diffusers.py => benchmark_diffusers.py} (95%) create mode 100644 benchmarks/benchmark_tensorrt.py diff --git a/README.md b/README.md index 0121b0f..7c1b0f4 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai). | Diffusers (fp16, xformers) | 1.758s | 1.759s | 1.746s | 1.772s | 28.43 it/s | | Diffusers (fp16, SDPA, compiled) | 1.352s | 1.351s | 1.348s | 1.356s | 37.01 it/s | | Diffusers (fp16, SDPA, compiled, NCHW channels last) | 1.066s | 1.065s | 1.062s | 1.076s | 46.95 it/s | +| TensorRT 9.0 (cuda graphs, static shapes) | 0.819s | 0.818s | 0.817s | 0.821s | 61.14 it/s | ### SDXL Benchmarks | | mean (s) | median (s) | min (s) | max (s) | speed (it/s) | @@ -23,6 +24,7 @@ Running on an A100 80G SXM hosted at [fal.ai](https://fal.ai). 
| Diffusers (fp16, xformers) | 5.724s | 5.724s | 5.714s | 5.731s | 8.74 it/s | | Diffusers (fp16, SDPA, compiled) | 5.246s | 5.247s | 5.233s | 5.259s | 9.53 it/s | | Diffusers (fp16, SDPA, compiled, NCHW channels last) | 5.132s | 5.132s | 5.121s | 5.142s | 9.74 it/s | +| TensorRT 9.0 (cuda graphs, static shapes) | 4.102s | 4.104s | 4.091s | 4.107s | 12.18 it/s | diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py index 803a38b..774e855 100644 --- a/benchmarks/__main__.py +++ b/benchmarks/__main__.py @@ -6,11 +6,12 @@ from rich.progress import track -from benchmarks import diffusers +from benchmarks import benchmark_diffusers, benchmark_tensorrt from benchmarks.settings import BenchmarkSettings, InputParameters ALL_BENCHMARKS = [ - *diffusers.LOCAL_BENCHMARKS, + *benchmark_diffusers.LOCAL_BENCHMARKS, + *benchmark_tensorrt.LOCAL_BENCHMARKS, ] diff --git a/benchmarks/diffusers.py b/benchmarks/benchmark_diffusers.py similarity index 95% rename from benchmarks/diffusers.py rename to benchmarks/benchmark_diffusers.py index ace0d21..e99a60f 100644 --- a/benchmarks/diffusers.py +++ b/benchmarks/benchmark_diffusers.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from functools import partial @@ -57,13 +59,10 @@ def diffusers_any( pipeline.unet, fullgraph=True, mode="reduce-overhead" ) - return benchmark_settings.apply( - partial( - pipeline, - parameters.prompt, - num_inference_steps=parameters.steps, - ) + inference_func = partial( + pipeline, parameters.prompt, num_inference_steps=parameters.steps ) + return benchmark_settings.apply(inference_func) LOCAL_BENCHMARKS = [ diff --git a/benchmarks/benchmark_tensorrt.py b/benchmarks/benchmark_tensorrt.py new file mode 100644 index 0000000..2f908ec --- /dev/null +++ b/benchmarks/benchmark_tensorrt.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import contextlib +import shutil +import subprocess +import sys +from functools import partial +from pathlib import Path + +import fal + +from 
from benchmarks.settings import BenchmarkResults, BenchmarkSettings, InputParameters

# Where the TensorRT checkout lives. Engines, ONNX exports and timing caches
# are written with relative paths from inside the checkout's demo directory
# (see the ``chdir`` in ``tensorrt_any``).
DATA_DIR = Path("/data/tensorrt")
REPO_DIR = DATA_DIR / "repo"


def prepare_tensorrt() -> Path:
    """Ensure the TensorRT demo repository is cloned and return its path.

    The ``release/9.0`` branch is cloned into ``REPO_DIR`` the first time this
    runs; later calls reuse the existing checkout.

    Returns:
        The path of the checkout (``REPO_DIR``).

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status. Any partially created checkout is removed first so that
            the next call retries from scratch.
    """
    # parents=True: do not fail when the parent of DATA_DIR is missing.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    if not REPO_DIR.exists():
        try:
            subprocess.check_call(
                [
                    "git",
                    "clone",
                    "https://github.com/rajeevsrao/TensorRT",
                    "--branch",
                    "release/9.0",
                    "--single-branch",
                    str(REPO_DIR),
                ]
            )
        except subprocess.CalledProcessError:
            print("Failed to clone TensorRT repo")
            # git may have failed before creating the directory; a missing
            # directory must not mask the original clone error.
            shutil.rmtree(REPO_DIR, ignore_errors=True)
            raise

    return REPO_DIR


@fal.function(
    # Copied from https://github.com/rajeevsrao/TensorRT/blob/release/9.0/demo/Diffusion/requirements.txt
    requirements=[
        "--pre",
        "accelerate==0.24.1",
        "colored",
        "controlnet_aux==0.0.6",
        "cuda-python",
        "diffusers==0.19.3",
        "ftfy",
        "matplotlib",
        "nvtx",
        "onnx-graphsurgeon",
        "onnx==1.14.0",
        "onnxruntime==1.15.1",
        "polygraphy==0.47.1",
        "scipy",
        "tensorrt==9.0.1.post12.dev4",
        "torch==2.1",
        "transformers==4.31.0",
        "--extra-index-url",
        "https://pypi.nvidia.com",
        "--extra-index-url",
        "https://pypi.ngc.nvidia.com",
    ],
    machine_type="GPU",
    _scheduler="nomad",
    _scheduler_options={
        "target_node": "65.21.219.34",
    },
)
def tensorrt_any(
    benchmark_settings: BenchmarkSettings,
    parameters: InputParameters,
    model_version: str,
    image_height: int,
    image_width: int,
) -> BenchmarkResults:
    """Benchmark the TensorRT Stable Diffusion demo pipeline.

    Builds (or loads) static-shape, static-batch engines for the requested
    model with CUDA graphs enabled, then hands a zero-argument inference
    callable to ``benchmark_settings.apply``.

    Args:
        benchmark_settings: Object whose ``apply`` method runs the callable
            and produces the results.
        parameters: Supplies the prompt (``prompt``) and the number of
            denoising steps (``steps``).
        model_version: ``"1.5"`` or ``"xl-1.0"``.
        image_height: Height the engines are built for and images are
            generated at.
        image_width: Width the engines are built for and images are
            generated at.

    Returns:
        Whatever ``benchmark_settings.apply`` returns for the inference
        callable.

    Raises:
        ValueError: If ``model_version`` is not one of the supported values.
        RuntimeError: If the shared device memory cannot be allocated.
    """
    trt_path = prepare_tensorrt()
    diffusion_dir = trt_path / "demo" / "Diffusion"
    # The demo is a directory of loose modules, not an installed package, so
    # it has to be importable through sys.path.
    if str(diffusion_dir) not in sys.path:
        sys.path.insert(0, str(diffusion_dir))

    # Every path handed to the demo below is relative; running from inside
    # the demo directory keeps those artifacts next to the checkout.
    with contextlib.chdir(diffusion_dir):
        from cuda import cudart
        from stable_diffusion_pipeline import StableDiffusionPipeline
        from utilities import PIPELINE_TYPE

        # Initialize demo
        options = {
            "version": model_version,
            "denoising_steps": parameters.steps,
            "use_cuda_graph": True,
            "max_batch_size": 4,
            "output_dir": "output",
        }

        if model_version == "1.5":
            options["pipeline_type"] = PIPELINE_TYPE.TXT2IMG
        elif model_version == "xl-1.0":
            options["pipeline_type"] = PIPELINE_TYPE.XL_BASE
            options["vae_scaling_factor"] = 0.13025
        else:
            raise ValueError(f"Unknown model version: {model_version}")

        pipeline = StableDiffusionPipeline(**options)
        pipeline.loadEngines(
            engine_dir=f"engine-{model_version}",
            framework_model_dir="pytorch_model",
            onnx_dir=f"onnx-{model_version}",
            onnx_opset=18,
            opt_batch_size=1,
            opt_image_height=image_height,
            opt_image_width=image_width,
            enable_all_tactics=False,
            enable_refit=False,
            force_build=False,
            force_export=False,
            force_optimize=False,
            static_batch=True,
            static_shape=True,
            timing_cache=f"cache-{model_version}",
        )

        # Load resources
        # cudaMalloc reports failure through its status code, not by raising.
        status, shared_device_memory = cudart.cudaMalloc(
            pipeline.calculateMaxDeviceMemory()
        )
        if status != cudart.cudaError_t.cudaSuccess:
            raise RuntimeError(f"cudaMalloc failed: {status}")
        pipeline.activateEngines(shared_device_memory)
        pipeline.loadResources(image_height, image_width, 1, seed=0)

        inference_func = partial(
            pipeline.infer,
            [parameters.prompt],
            [""],
            image_height=image_height,
            image_width=image_width,
            save_image=False,
        )
        # Tear the pipeline down even when the benchmark run raises, so a
        # failed run does not leave its resources allocated.
        try:
            results = benchmark_settings.apply(inference_func)
        finally:
            pipeline.teardown()

    return results


# Both entries share a display name; "category" is what tells them apart.
LOCAL_BENCHMARKS = [
    {
        "name": "TensorRT 9.0 (cuda graphs, static shapes)",
        "category": "SD1.5",
        "function": tensorrt_any,
        "kwargs": {
            "model_version": "1.5",
            "image_height": 512,
            "image_width": 512,
        },
    },
    {
        "name": "TensorRT 9.0 (cuda graphs, static shapes)",
        "category": "SDXL",
        "function": tensorrt_any,
        "kwargs": {
            "model_version": "xl-1.0",
            # NOTE(review): SDXL runs at the same 512x512 as SD1.5 here;
            # confirm this matches the resolution used by the other SDXL
            # benchmarks it is compared against.
            "image_height": 512,
            "image_width": 512,
        },
    },
]