Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions docs/OperatorKernels.md
Original file line number Diff line number Diff line change
Expand Up @@ -747,7 +747,9 @@ Do not modify directly.*
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|GreaterOrEqual|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T1** = tensor(bool)|
|||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T1** = tensor(bool)|
|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|16+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|22+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|||[20, 21]|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|||[16, 19]|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|HardSigmoid|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[6, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
|HardSwish|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
Expand Down Expand Up @@ -1062,7 +1064,9 @@ Do not modify directly.*
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|GlobalAveragePool|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|GlobalMaxPool|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|16+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|22+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|||[20, 21]|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|||[16, 19]|**T1** = tensor(float)<br/> **T2** = tensor(float)|
|LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|MaxPool|*in* X:**T**<br> *out* Y:**T**<br><br>or<br><br>*in* X:**T**<br> *out* Y:**T**<br> *out* Indices:**I**|12+|**I** = tensor(int64)<br/> **T** = tensor(float), tensor(float16), tensor(int8), tensor(uint8)|
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -748,7 +748,7 @@ std::vector<int64_t> ChannelLastToFirstPerm(size_t rank) {
}

std::vector<int64_t> p(rank);
p[0] = 0;
p[0] = 0; // This is usually the batch dimension (hence preserve this position)
p[1] = rank - 1;
for (size_t i = 2; i < rank; ++i) {
p[i] = i - 1;
Expand Down
8 changes: 6 additions & 2 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -1421,7 +1421,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterND);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 19, float, GridSample);

// Opset 17
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
Expand Down Expand Up @@ -1510,6 +1510,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN);
class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, 21, float, GridSample);

// Opset 21.
// TODO(fajin): support other quantized types
Expand Down Expand Up @@ -1583,6 +1584,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, HardSwish);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, GridSample);

// Opset 23.
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float, Attention);
Expand Down Expand Up @@ -2485,7 +2487,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterND)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 19, float, GridSample)>,

// Opset 17
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization)>,
Expand Down Expand Up @@ -2582,6 +2584,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, 21, float, GridSample)>,

// Opset 21
// TODO(fajin): support other quantized types
Expand Down Expand Up @@ -2654,6 +2657,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, HardSwish)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, GridSample)>,

// Opset 23
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 23, float, Attention)>,
Expand Down
9 changes: 7 additions & 2 deletions onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -164,12 +164,17 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) {
#ifndef DISABLE_CONTRIB_OPS
namespace onnxruntime::contrib::cuda {

class CUDA_NHWC_OP_TYPED_CLASS_NAME(16, float, GridSample);
class CUDA_NHWC_OP_VERSIONED_TYPED_CLASS_NAME(16, 19, float, GridSample);
class CUDA_NHWC_OP_VERSIONED_TYPED_CLASS_NAME(20, 21, float, GridSample);
class CUDA_NHWC_OP_TYPED_CLASS_NAME(22, float, GridSample);

onnxruntime::common::Status RegisterCudaNhwcContribKernels(onnxruntime::KernelRegistry& kernel_registry) {
static const BuildKernelCreateInfoFn nhwc_function_table[] = {
BuildKernelCreateInfo<void>, // default entry to avoid the list become empty after ops-reducing
BuildKernelCreateInfo<CUDA_NHWC_OP_TYPED_CLASS_NAME(16, float, GridSample)>,
BuildKernelCreateInfo<CUDA_NHWC_OP_VERSIONED_TYPED_CLASS_NAME(16, 19, float, GridSample)>,
BuildKernelCreateInfo<CUDA_NHWC_OP_VERSIONED_TYPED_CLASS_NAME(20, 21, float, GridSample)>,
BuildKernelCreateInfo<CUDA_NHWC_OP_TYPED_CLASS_NAME(22, float, GridSample)>,

};

for (auto& function_table_entry : nhwc_function_table) {
Expand Down
158 changes: 127 additions & 31 deletions onnxruntime/core/providers/cuda/tensor/grid_sample.cc
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,66 @@
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
onnxruntime::contrib::cuda::GridSample<T, LAYOUT>);

// Registers a version-ranged (FROM_VERSION through TO_VERSION, inclusive) typed GridSample
// kernel with the CUDA execution provider under the given DOMAIN, constraining both the
// T1 (input/output) and T2 (grid) type parameters to tensor(T) and binding the
// onnxruntime::contrib::cuda::GridSample<T, LAYOUT> implementation.
// NOTE(review): both type constraints use GetTensorType<T>() — presumably intentional since
// the ONNX spec ties X and grid to the same float type here; confirm if T1/T2 ever diverge.
#define REGISTER_KERNEL_VERSIONED_TYPED(T, FROM_VERSION, TO_VERSION, LAYOUT, DOMAIN) \
ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX( \
GridSample, \
DOMAIN, \
FROM_VERSION, \
TO_VERSION, \
T, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
onnxruntime::contrib::cuda::GridSample<T, LAYOUT>);

REGISTER_KERNEL_TYPED(float, 1, LAYOUT_NCHW, kMSDomain)

#ifdef ENABLE_CUDA_NHWC_OPS
REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NHWC, kMSInternalNHWCDomain)
// Op was introduced in opset 16
REGISTER_KERNEL_VERSIONED_TYPED(float, 16, 19, LAYOUT_NHWC, kMSInternalNHWCDomain)

// Op was modified to support multiple spatial dimensions in opset 20
REGISTER_KERNEL_VERSIONED_TYPED(float, 20, 21, LAYOUT_NHWC, kMSInternalNHWCDomain)

// Op spec introduced BFloat16 support in opset 22
REGISTER_KERNEL_TYPED(float, 22, LAYOUT_NHWC, kMSInternalNHWCDomain)
#endif

template <typename T, bool IsNHWC>
GridSample<T, IsNHWC>::GridSample(const OpKernelInfo& info) : CudaKernel(info) {
opset_start_version_ = info.node().SinceVersion();

std::string mode_str = info.GetAttrOrDefault<std::string>("mode", "bilinear");
std::string padding_mode_str = info.GetAttrOrDefault<std::string>("padding_mode", "zeros");
align_corners_ = static_cast<bool>(info.GetAttrOrDefault<int64_t>("align_corners", 0));
ORT_ENFORCE(mode_str == "bilinear" || mode_str == "nearest" || mode_str == "bicubic",
"mode \"", mode_str, "\" not supported, expect bilinear, nearest or bicubic");
ORT_ENFORCE(padding_mode_str == "zeros" || padding_mode_str == "border" || padding_mode_str == "reflection",
"padding_mode \"", padding_mode_str, "\" not supported, expect zeros, border or reflection");
if (mode_str == "bicubic") {
mode_i_ = 2;
} else if (mode_str == "nearest") {
mode_i_ = 1;

if (opset_start_version_ >= 20) {
std::string mode_str = info.GetAttrOrDefault<std::string>("mode", "linear");
if (mode_str == "cubic") {
Comment on lines 32 to +60
Copy link

Copilot AI Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mode_str is read with default "bilinear" and then immediately shadowed by a new mode_str inside both the opset>=20 and opset<20 branches, leaving the outer variable unused (can trigger -Wunused-variable). Consider removing the outer mode_str and only reading the attribute within the version-specific branch (or reuse the existing variable instead of shadowing).

Copilot uses AI. Check for mistakes.
mode_i_ = 2;
} else if (mode_str == "nearest") {
mode_i_ = 1;
} else if (mode_str == "linear") {
mode_i_ = 0;
} else {
ORT_THROW("mode \"", mode_str, "\" not supported, expect linear, nearest or cubic");
}
} else {
mode_i_ = 0;
std::string mode_str = info.GetAttrOrDefault<std::string>("mode", "bilinear");

Check warning on line 70 in onnxruntime/core/providers/cuda/tensor/grid_sample.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <string> for string [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/cuda/tensor/grid_sample.cc:70: Add #include <string> for string [build/include_what_you_use] [4]
if (mode_str == "bicubic") {
mode_i_ = 2;
} else if (mode_str == "nearest") {
mode_i_ = 1;
} else if (mode_str == "bilinear") {
mode_i_ = 0;
} else {
ORT_THROW("mode \"", mode_str, "\" not supported, expect bilinear, nearest or bicubic");
}
}

ORT_ENFORCE(padding_mode_str == "zeros" || padding_mode_str == "border" || padding_mode_str == "reflection",
"padding_mode \"", padding_mode_str, "\" not supported, expect zeros, border or reflection");
if (padding_mode_str == "reflection") {
padding_mode_i_ = 2;
} else if (padding_mode_str == "border") {
Expand All @@ -59,44 +97,102 @@
const Tensor* Grid = context->Input<Tensor>(1);
const auto& dims_grid = Grid->Shape().GetDims();

if (dims_input.size() != 4 || dims_grid.size() != 4) {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Only 4-D tensor is supported");
if (dims_input.size() != dims_grid.size()) {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Input and grid must have the same number of dimensions");
}

if (opset_start_version_ < 20 && dims_input.size() != 4) {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Opset 16-19 versions of this op only supports 4-D input tensors");
}

if (dims_input[0] != dims_grid[0]) {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Grid batch size does not match input batch size ");
}

if ((dims_input.size() == 4 && dims_grid[3] != 2) || (dims_input.size() == 5 && dims_grid[4] != 3)) {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT,
"Last dimension of grid input must match the number of "
"spatial dimensions in the input (2 for 2D, 3 for 3D).");
}

if (dims_input.size() != 4 && dims_input.size() != 5) {
return Status(common::ONNXRUNTIME, common::NOT_IMPLEMENTED, "Only 4-D and 5-D input tensors are supported");
}

if (dims_input.size() == 5 && mode_i_ == 2) {
// This is common for CPU and CUDA to not support Cubic mode for 5D input
// So it won't break CUDA users who were previously dropping down to CPU version of the op.
return Status(common::ONNXRUNTIME, common::NOT_IMPLEMENTED, "Cubic mode is only supported in 4-D cases.");
}
ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0], " does not match input batch size ", dims_input[0]);
ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2");

using Ch = Channels<IsNHWC>;

TensorShapeVector dims_output(4);
dims_output[Ch::N] = dims_input[Ch::N];
dims_output[Ch::C] = dims_input[Ch::C];
dims_output[Ch::H] = dims_grid[1 /* Grid::H */];
dims_output[Ch::W] = dims_grid[2 /* Grid::W */];
TensorShapeVector dims_output(dims_input.size());
if (dims_input.size() == 4) {
dims_output[Ch::N] = dims_input[Ch::N];
dims_output[Ch::C] = dims_input[Ch::C];
dims_output[Ch::H] = dims_grid[1 /* Grid::H */];
dims_output[Ch::W] = dims_grid[2 /* Grid::W */];
} else {
// 5D input - deal with both NCHW and NHWC layouts
dims_output[0] = dims_input[0];
dims_output[1] = !IsNHWC ? dims_input[1] : dims_grid[1];
dims_output[2] = !IsNHWC ? dims_grid[1] : dims_grid[2];
dims_output[3] = !IsNHWC ? dims_grid[2] : dims_grid[3];
dims_output[4] = !IsNHWC ? dims_grid[3] : dims_input[4];
}
Tensor* Y = context->Output(0, dims_output);

// Return early if the output tensor is going to be of size 0
if (Y->Shape().Size() == 0) {
return Status::OK();
}

typedef typename ToCudaType<T>::MappedType CudaT;
CudaT* Y_data = reinterpret_cast<CudaT*>(Y->MutableData<T>());
GridSampleImpl<CudaT, IsNHWC>(
Stream(context),
reinterpret_cast<const CudaT*>(X->Data<T>()),
reinterpret_cast<const CudaT*>(Grid->Data<T>()),
mode_i_,
padding_mode_i_,
align_corners_,
dims_input.data(),
dims_grid[1],
dims_grid[2],
Y_data);

if (dims_input.size() == 4) {
// sample 2d
GridSampleImpl<CudaT, IsNHWC>(
Stream(context),
reinterpret_cast<const CudaT*>(X->Data<T>()),
reinterpret_cast<const CudaT*>(Grid->Data<T>()),
mode_i_,
padding_mode_i_,
align_corners_,
dims_input.data(),
dims_grid[1],
dims_grid[2],
Y_data);
} else {
// sample 3d
GridSampleImpl3D<CudaT, IsNHWC>(
Stream(context),
reinterpret_cast<const CudaT*>(X->Data<T>()),
reinterpret_cast<const CudaT*>(Grid->Data<T>()),
mode_i_,
padding_mode_i_,
align_corners_,
dims_input.data(),
dims_grid[1],
dims_grid[2],
dims_grid[3],
Y_data);
}

return Status::OK();
}
} // namespace cuda
} // namespace contrib

namespace cuda {
REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NCHW, kOnnxDomain)
// Op was introduced in opset 16
REGISTER_KERNEL_VERSIONED_TYPED(float, 16, 19, LAYOUT_NCHW, kOnnxDomain)

// Op was modified to support multiple spatial dimensions in opset 20
REGISTER_KERNEL_VERSIONED_TYPED(float, 20, 21, LAYOUT_NCHW, kOnnxDomain)

// Op spec introduced BFloat16 support in opset 22
REGISTER_KERNEL_TYPED(float, 22, LAYOUT_NCHW, kOnnxDomain)
} // namespace cuda
} // namespace onnxruntime
1 change: 1 addition & 0 deletions onnxruntime/core/providers/cuda/tensor/grid_sample.h
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class GridSample final : public CudaKernel {
int64_t mode_i_; // 0: bilinear (default), 1: nearest 2: bicubic
int64_t padding_mode_i_; // 0:'zeros', 1: 'border', 2:'reflection'
int64_t align_corners_;
int opset_start_version_;
};

} // namespace cuda
Expand Down
Loading
Loading