From 72186bbb71dfa34e04afd2294885fde2acc5bf55 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Wed, 6 Nov 2024 09:54:55 -0800
Subject: [PATCH] [CUDA] Build nhwc ops by default (#22648)

### Description

* Build cuda nhwc ops by default.
* Deprecate `--enable_cuda_nhwc_ops` in build.py and add
`--disable_cuda_nhwc_ops` option

Note that it requires cuDNN 9.x. If you build with cuDNN 8, NHWC ops
will be disabled automatically.

### Motivation and Context

In general, NHWC is faster than NCHW for convolution in Nvidia GPUs with
Tensor Cores, and this could improve performance for vision models.

This is the first step to prefer NHWC for CUDA in 1.21 release. Next
step is to do some tests on popular vision models. If it help in most
models and devices, set `prefer_nhwc=1` as default cuda provider option.
---
 cmake/CMakeLists.txt                          |  2 +-
 dockerfiles/Dockerfile.cuda                   |  1 -
 docs/OperatorKernels.md                       | 29 ++++++++++++
 .../models/sam2/benchmark_sam2.sh             |  1 -
 .../test/providers/cpu/nn/conv_op_test.cc     | 21 +++++++--
 tools/ci_build/build.py                       | 15 ++++++-
 .../azure-pipelines/bigmodels-ci-pipeline.yml |  2 +-
 tools/ci_build/github/linux/build_cuda_ci.sh  | 45 ++++++++++---------
 8 files changed, 85 insertions(+), 31 deletions(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8d9f08cee05e7..31ebf58b03152 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -86,7 +86,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
 cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
 
-option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
+cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
 option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
diff --git a/dockerfiles/Dockerfile.cuda b/dockerfiles/Dockerfile.cuda
index d2d656648f2e7..ce4560e9b0c7c 100644
--- a/dockerfiles/Dockerfile.cuda
+++ b/dockerfiles/Dockerfile.cuda
@@ -56,7 +56,6 @@ RUN cd /code \
     --build_shared_lib --skip_tests \
     --config Release --build_wheel --update --build --parallel \
     --cmake_generator Ninja \
-    --enable_cuda_nhwc_ops \
     --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" onnxruntime_BUILD_UNIT_TESTS=OFF
 
 # Start second stage to copy the build artifacts
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 5fb1e54b38c2b..e23a52757dedb 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -925,6 +925,35 @@ Do not modify directly.*
 |WhisperBeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *in* cross_qk_layer_head:**I**<br> *in* extra_decoding_ids:**I**<br> *in* temperature:**T**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**<br> *out* cross_qk:**V**<br> *out* non_speech_probs:**T**|1+|**T** = tensor(float), tensor(float16)|
 | |
 | |
+|**Operator Domain:** *com.ms.internal.nhwc*||||
+|AveragePool|*in* X:**T**<br> *out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||10|**T** = tensor(float), tensor(float16)|
+|||[7, 9]|**T** = tensor(float), tensor(float16)|
+|BatchNormalization|*in* X:**T**<br> *in* scale:**T**<br> *in* B:**T**<br> *in* input_mean:**U**<br> *in* input_var:**U**<br> *out* Y:**T**<br> *out* running_mean:**U**<br> *out* running_var:**U**<br><br>or<br><br>*in* X:**T**<br> *in* scale:**T**<br> *in* B:**T**<br> *in* mean:**T**<br> *in* var:**T**<br> *out* Y:**T**<br> *out* mean:**T**<br> *out* var:**T**<br> *out* saved_mean:**T**<br> *out* saved_var:**T**<br><br>or<br><br>*in* X:**T**<br> *in* scale:**T1**<br> *in* B:**T1**<br> *in* input_mean:**T2**<br> *in* input_var:**T2**<br> *out* Y:**T**<br> *out* running_mean:**T2**<br> *out* running_var:**T2**|15+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
+|||14|**T** = tensor(double), tensor(float), tensor(float16)<br/> **U** = tensor(double), tensor(float), tensor(float16)|
+|||[9, 13]|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
+|Conv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(float), tensor(float16)|
+|ConvTranspose|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|11+|**T** = tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(float), tensor(float16)|
+|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
+|GlobalAveragePool|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
+|GlobalMaxPool|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
+|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|16+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
+|LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+|MaxPool|*in* X:**T**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T**<br> *out* Y:**T**<br> *out* Indices:**I**|12+|**I** = tensor(int64)<br/> **T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)|
+|||11|**I** = tensor(int64)<br/> **T** = tensor(float), tensor(float16)|
+|||10|**I** = tensor(int64)<br/> **T** = tensor(float), tensor(float16)|
+|||[8, 9]|**I** = tensor(int64)<br/> **T** = tensor(float), tensor(float16)|
+|||[1, 7]|**T** = tensor(float), tensor(float16)|
+|SpaceToDepth|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
+|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
+| |
+| |
 
 
 <a name="dmlexecutionprovider"/>
diff --git a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
index e6da988f5c0df..9e97867657ab9 100644
--- a/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
+++ b/onnxruntime/python/tools/transformers/models/sam2/benchmark_sam2.sh
@@ -191,7 +191,6 @@ build_onnxruntime_gpu_for_profiling() {
                 --build_wheel --skip_tests \
                 --cmake_generator Ninja \
                 --compile_no_warning_as_error \
-                --enable_cuda_nhwc_ops \
                 --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=$CUDA_ARCH \
                 --cmake_extra_defines onnxruntime_ENABLE_NVTX_PROFILE=ON \
                 --enable_cuda_line_info
diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
index 25caa732efa25..a3a3dd939cbf0 100644
--- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include "core/graph/constants.h"
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
+
 using namespace std;
 namespace onnxruntime {
 namespace test {
@@ -28,7 +29,8 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes,
                 optional<float> epsilon = optional<float>(),
                 OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
                 const std::string& err_str = "",
-                int opset = 7) {
+                int opset = 7,
+                bool exclude_cuda_nhwc = false) {
   OpTester test("Conv", opset);
   test.AddAttribute("group", attributes.group);
   test.AddAttribute("kernel_shape", attributes.kernel_shape);
@@ -65,6 +67,12 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes,
   // Disable TensorRT because weight as input is not supported
   excluded_providers.insert(kTensorrtExecutionProvider);
 
+  if (exclude_cuda_nhwc) {
+#ifdef ENABLE_CUDA_NHWC_OPS
+    excluded_providers.insert(kCudaNHWCExecutionProvider);
+#endif
+  }
+
   // QNN SDK 2.10.0 has a bug that breaks support for dynamic bias inputs.
   excluded_providers.insert(kQnnExecutionProvider);
 
@@ -197,10 +205,15 @@ TEST(ConvTest, Conv1D_Bias) {
   // as TF32 has a 10 bit mantissa.
   float epsilon = 1.1e-5f;
 
-  TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon);
+  // This case is not supported by cuDNN frontend, and the fallback (legacy code) requires weight to 4D tensor for NHWC.
+  constexpr bool exclude_cuda_nhwc = true;
+
+  TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, false, epsilon,
+             OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc);
 
   // CoreML EP requires weight to be an initializer
-  TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon);
+  TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true, epsilon,
+             OpTester::ExpectResult::kExpectSuccess, "", 10, exclude_cuda_nhwc);
 }
 
 // Conv47
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 5cc040fd70b09..24dc6124d4a89 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -13,6 +13,7 @@
 import shutil
 import subprocess
 import sys
+import warnings
 from pathlib import Path
 
 
@@ -253,7 +254,12 @@ def convert_arg_line_to_args(self, arg_line):
         "--cudnn_home is not specified.",
     )
     parser.add_argument("--enable_cuda_line_info", action="store_true", help="Enable CUDA line info.")
-    parser.add_argument("--enable_cuda_nhwc_ops", action="store_true", help="Enable CUDA NHWC ops in build.")
+
+    parser.add_argument(
+        "--enable_cuda_nhwc_ops", action="store_true", help="Deprecated; default to enable CUDA NHWC ops in build."
+    )
+
+    parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA NHWC ops in build.")
 
     # Python bindings
     parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.")
@@ -793,6 +799,11 @@ def convert_arg_line_to_args(self, arg_line):
     if args.cmake_generator is None and is_windows():
         args.cmake_generator = "Ninja" if args.build_wasm else "Visual Studio 17 2022"
 
+    if args.enable_cuda_nhwc_ops:
+        warnings.warn(
+            "The argument '--enable_cuda_nhwc_ops' is deprecated and is default to True. ", DeprecationWarning
+        )
+
     return args
 
 
@@ -1074,7 +1085,7 @@ def generate_build_tree(
         "-Donnxruntime_USE_MPI=" + ("ON" if args.use_mpi else "OFF"),
         "-Donnxruntime_ENABLE_MEMORY_PROFILE=" + ("ON" if args.enable_memory_profile else "OFF"),
         "-Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=" + ("ON" if args.enable_cuda_line_info else "OFF"),
-        "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.enable_cuda_nhwc_ops else "OFF"),
+        "-Donnxruntime_USE_CUDA_NHWC_OPS=" + ("ON" if args.use_cuda and not args.disable_cuda_nhwc_ops else "OFF"),
         "-Donnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB=" + ("ON" if args.build_wasm_static_lib else "OFF"),
         "-Donnxruntime_ENABLE_WEBASSEMBLY_EXCEPTION_CATCHING="
         + ("OFF" if args.disable_wasm_exception_catching else "ON"),
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 5af95d345794d..0da1f8fc8d540 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -123,7 +123,7 @@ stages:
                 --parallel \
                 --build_wheel \
                 --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \
-                --enable_cuda_profiling --enable_cuda_nhwc_ops \
+                --enable_cuda_profiling \
                 --enable_pybind --build_java \
                 --use_cache \
                 --cmake_extra_defines  'CMAKE_CUDA_ARCHITECTURES=75;86' ; \
diff --git a/tools/ci_build/github/linux/build_cuda_ci.sh b/tools/ci_build/github/linux/build_cuda_ci.sh
index a78e240998350..0533b7b394492 100755
--- a/tools/ci_build/github/linux/build_cuda_ci.sh
+++ b/tools/ci_build/github/linux/build_cuda_ci.sh
@@ -3,28 +3,31 @@ set -ex
 #Every cuda container has this $CUDA_VERSION env var set.
 SHORT_CUDA_VERSION=$(echo $CUDA_VERSION | sed   's/\([[:digit:]]\+\.[[:digit:]]\+\)\.[[:digit:]]\+/\1/')
 
-BUILD_ARGS=('--config' 'Release' '--update' '--build'
-              '--skip_submodule_sync'
-              '--build_shared_lib'
-              '--parallel' '--use_binskim_compliant_compile_flags'
-              '--build_wheel'
-              '--enable_onnx_tests'
-	      '--use_cuda'
-	      "--cuda_version=$SHORT_CUDA_VERSION"
-	      "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
-	      "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
-              "--enable_cuda_profiling"
-	      "--enable_cuda_nhwc_ops"
-              "--enable_pybind"
-	      "--build_java"
-              "--cmake_extra_defines"
-	      "CMAKE_CUDA_ARCHITECTURES=75"
-	      "onnxruntime_BUILD_UNIT_TESTS=ON"
-	      "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
+BUILD_ARGS=('--config'
+            'Release'
+            '--update'
+            '--build'
+            '--skip_submodule_sync'
+            '--build_shared_lib'
+            '--parallel'
+            '--use_binskim_compliant_compile_flags'
+            '--build_wheel'
+            '--enable_onnx_tests'
+            '--use_cuda'
+            "--cuda_version=$SHORT_CUDA_VERSION"
+            "--cuda_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
+            "--cudnn_home=/usr/local/cuda-$SHORT_CUDA_VERSION"
+            "--enable_cuda_profiling"
+            "--enable_pybind"
+            "--build_java"
+            "--cmake_extra_defines"
+            "CMAKE_CUDA_ARCHITECTURES=75"
+            "onnxruntime_BUILD_UNIT_TESTS=ON"
+            "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON")
 if [ -x "$(command -v ninja)" ]; then
     BUILD_ARGS+=('--cmake_generator' 'Ninja')
 fi
-    
+
 if [ -d /build ]; then
     BUILD_ARGS+=('--build_dir' '/build')
 else
@@ -40,7 +43,7 @@ if [ -f /opt/python/cp312-cp312/bin/python3 ]; then
 else
     python3 tools/ci_build/build.py "${BUILD_ARGS[@]}"
 fi
-if [ -x "$(command -v ccache)" ]; then				  
-    ccache -sv 
+if [ -x "$(command -v ccache)" ]; then
+    ccache -sv
     ccache -z
 fi