diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
index 3879e25386d53..0708d57f040f8 100755
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/benchmark.py
@@ -51,6 +51,10 @@ def example_prompts():
     return prompts, negative_prompt
 
 
+def warmup_prompts():
+    return "warm up", "bad"
+
+
 def measure_gpu_memory(monitor_type, func, start_memory=None):
     return measure_memory(is_gpu=True, func=func, monitor_type=monitor_type, start_memory=start_memory)
 
@@ -136,7 +140,14 @@ def run_ort_pipeline(
     prompts, negative_prompt = example_prompts()
 
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )
 
     # Run warm up, and measure GPU memory of two runs
     # cuDNN/MIOpen The first run has algo search so it might need more memory)
@@ -149,22 +160,20 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                [prompt] * batch_size,
-                height,
-                width,
-                num_inference_steps=steps,
-                negative_prompt=[negative_prompt] * batch_size,
-                guidance_scale=7.5,
-            ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+        ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     from onnxruntime import __version__ as ort_version
 
@@ -200,7 +209,14 @@ def run_torch_pipeline(
 
     # total 2 runs of warm up, and measure GPU memory for CUDA EP
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative] * batch_size,
+        )
 
     # Run warm up, and measure GPU memory of two runs (The first run has cuDNN algo search so it might need more memory)
     first_run_memory = measure_gpu_memory(memory_monitor_type, warmup, start_memory)
@@ -215,25 +231,23 @@ def warmup():
         if i >= num_prompts:
             break
         torch.cuda.synchronize()
-        for j in range(batch_count):
-            inference_start = time.time()
-            images = pipe(
-                prompt=[prompt] * batch_size,
-                height=height,
-                width=width,
-                num_inference_steps=steps,
-                guidance_scale=7.5,
-                negative_prompt=[negative_prompt] * batch_size,
-                generator=None,  # torch.Generator
-            ).images
+        inference_start = time.time()
+        images = pipe(
+            prompt=[prompt] * batch_size,
+            height=height,
+            width=width,
+            num_inference_steps=steps,
+            negative_prompt=[negative_prompt] * batch_size,
+            generator=None,  # torch.Generator
+        ).images
 
-            torch.cuda.synchronize()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        torch.cuda.synchronize()
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     return {
         "engine": "torch",
@@ -306,6 +320,7 @@ def get_optimum_ort_pipeline(
     directory: str,
     provider="CUDAExecutionProvider",
     disable_safety_checker: bool = True,
+    use_io_binding: bool = False,
 ):
     from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
 
@@ -321,7 +336,7 @@ def get_optimum_ort_pipeline(
             pipeline = ORTStableDiffusionPipeline.from_pretrained(
                 directory,
                 provider=provider,
-                use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+                use_io_binding=use_io_binding,
             )
     elif "xl" in model_name:
         pipeline = ORTStableDiffusionXLPipeline.from_pretrained(
@@ -337,7 +352,7 @@ def get_optimum_ort_pipeline(
             model_name,
             export=True,
             provider=provider,
-            use_io_binding=False,  # Not supported by Optimum version 1.17.1 at the time of verification.
+            use_io_binding=use_io_binding,
         )
         pipeline.save_pretrained(directory)
 
@@ -359,15 +374,33 @@ def run_optimum_ort_pipeline(
     batch_count,
     start_memory,
     memory_monitor_type,
+    use_num_images_per_prompt=False,
 ):
     from optimum.onnxruntime import ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline
 
     assert isinstance(pipe, (ORTStableDiffusionPipeline, ORTStableDiffusionXLPipeline))
 
-    prompts = example_prompts()
+    prompts, negative_prompt = example_prompts()
 
     def warmup():
-        pipe("warm up", height, width, num_inference_steps=steps, num_images_per_prompt=batch_size)
+        prompt, negative = warmup_prompts()
+        if use_num_images_per_prompt:
+            pipe(
+                prompt=prompt,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=negative,
+                num_images_per_prompt=batch_count,
+            )
+        else:
+            pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative] * batch_size,
+            )
 
     # Run warm up, and measure GPU memory of two runs.
     # The first run has algo search for cuDNN/MIOpen, so it might need more memory.
@@ -380,23 +413,30 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
+        inference_start = time.time()
+        if use_num_images_per_prompt:
             images = pipe(
-                prompt,
-                height,
-                width,
+                prompt=prompt,
+                height=height,
+                width=width,
                 num_inference_steps=steps,
-                negative_prompt=None,
-                guidance_scale=0.0,  # 7.5
+                negative_prompt=negative_prompt,
                 num_images_per_prompt=batch_size,
             ).images
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"Inference took {latency:.3f} seconds")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        else:
+            images = pipe(
+                prompt=[prompt] * batch_size,
+                height=height,
+                width=width,
+                num_inference_steps=steps,
+                negative_prompt=[negative_prompt] * batch_size,
+            ).images
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"Inference took {latency:.3f} seconds")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     from onnxruntime import __version__ as ort_version
 
@@ -429,9 +469,12 @@ def run_optimum_ort(
     batch_count: int,
     start_memory,
     memory_monitor_type,
+    use_io_binding: bool = False,
 ):
     load_start = time.time()
-    pipe = get_optimum_ort_pipeline(model_name, directory, provider, disable_safety_checker)
+    pipe = get_optimum_ort_pipeline(
+        model_name, directory, provider, disable_safety_checker, use_io_binding=use_io_binding
+    )
     load_end = time.time()
     print(f"Model loading took {load_end - load_start} seconds")
 
@@ -530,9 +573,8 @@ def run_ort_trt_static(
     pipeline.load_resources(height, width, batch_size)
 
     def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -548,24 +590,23 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            guidance=7.5,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     pipeline.teardown()
 
@@ -671,9 +712,8 @@ def run_tensorrt_static(
     pipeline.load_resources(height, width, batch_size)
 
    def warmup():
-        pipeline.run(
-            ["warm up"] * batch_size, ["negative"] * batch_size, height, width, denoising_steps=steps, warmup=True
-        )
+        prompt, negative = warmup_prompts()
+        pipeline.run([prompt] * batch_size, [negative] * batch_size, height, width, denoising_steps=steps)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -689,24 +729,22 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            images, pipeline_time = pipeline.run(
-                [prompt] * batch_size,
-                [negative_prompt] * batch_size,
-                height,
-                width,
-                denoising_steps=steps,
-                guidance=7.5,
-                seed=123,
-            )
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.jpg")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = pipeline.run(
+            [prompt] * batch_size,
+            [negative_prompt] * batch_size,
+            height,
+            width,
+            denoising_steps=steps,
+            seed=123,
+        )
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.jpg")
 
     pipeline.teardown()
 
@@ -828,7 +866,8 @@ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
         )
 
     def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -845,20 +884,15 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                image.save(f"{image_filename_prefix}_{i}_{j}_{k}.png")
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            image.save(f"{image_filename_prefix}_{i}_{k}.png")
 
     pipeline.teardown()
 
@@ -911,8 +945,6 @@ def run_ort_trt_xl(
         opt_batch_size=batch_size,
     )
 
-    from cuda import cudart
-
     assert batch_size <= max_batch_size
 
     pipeline.load_resources(height, width, batch_size)
@@ -929,7 +961,8 @@ def run_sd_xl_inference(prompt, negative_prompt, seed=None):
         )
 
     def warmup():
-        run_sd_xl_inference(["warm up"] * batch_size, ["negative"] * batch_size)
+        prompt, negative = warmup_prompts()
+        run_sd_xl_inference([prompt] * batch_size, [negative] * batch_size)
 
     # Run warm up, and measure GPU memory of two runs
     # The first run has algo search so it might need more memory
@@ -946,22 +979,17 @@ def warmup():
     for i, prompt in enumerate(prompts):
         if i >= num_prompts:
             break
-        for j in range(batch_count):
-            inference_start = time.time()
-            # Use warmup mode here since non-warmup mode will save image to disk.
-            if nvtx_profile:
-                cudart.cudaProfilerStart()
-            images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
-            if nvtx_profile:
-                cudart.cudaProfilerStop()
-            inference_end = time.time()
-            latency = inference_end - inference_start
-            latency_list.append(latency)
-            print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
-            for k, image in enumerate(images):
-                filename = f"{image_filename_prefix}_{i}_{j}_{k}.png"
-                image.save(filename)
-                print("Image saved to", filename)
+        inference_start = time.time()
+        # Use warmup mode here since non-warmup mode will save image to disk.
+        images, pipeline_time = run_sd_xl_inference([prompt] * batch_size, [negative_prompt] * batch_size, seed=123)
+        inference_end = time.time()
+        latency = inference_end - inference_start
+        latency_list.append(latency)
+        print(f"End2End took {latency:.3f} seconds. Inference latency: {pipeline_time}")
+        for k, image in enumerate(images):
+            filename = f"{image_filename_prefix}_{i}_{k}.png"
+            image.save(filename)
+            print("Image saved to", filename)
 
     pipeline.teardown()
 
@@ -1137,6 +1165,14 @@ def parse_arguments():
     )
     parser.set_defaults(use_xformers=False)
 
+    parser.add_argument(
+        "--use_io_binding",
+        required=False,
+        action="store_true",
+        help="Use I/O Binding for Optimum.",
+    )
+    parser.set_defaults(use_io_binding=False)
+
     parser.add_argument(
         "-b",
         "--batch_size",
@@ -1176,8 +1212,8 @@ def parse_arguments():
         "--num_prompts",
         required=False,
        type=int,
-        default=1,
-        help="Number of prompts. Default is 1.",
+        default=10,
+        help="Number of prompts. Default is 10.",
     )
 
     parser.add_argument(
@@ -1312,6 +1348,7 @@ def main():
             batch_count=args.batch_count,
             start_memory=start_memory,
             memory_monitor_type=memory_monitor_type,
+            use_io_binding=args.use_io_binding,
         )
     elif args.engine == "onnxruntime":
         assert args.pipeline and os.path.isdir(