Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1af3abd
Add test model
yuslepukhin Dec 16, 2025
4be2420
Add test model
yuslepukhin Dec 16, 2025
4029951
Py formatting
yuslepukhin Dec 16, 2025
1eda095
Merge branch 'yuslepukhin/pad_rce' of https://github.com/microsoft/on…
yuslepukhin Dec 16, 2025
0b84e6b
Continue testing
yuslepukhin Dec 18, 2025
fdfe2b5
Commit the latest
yuslepukhin Dec 19, 2025
3e711a5
Fix up constant
yuslepukhin Dec 19, 2025
af3a016
More refactoring
yuslepukhin Dec 19, 2025
e1e8fa9
Add instrumentation
yuslepukhin Dec 19, 2025
ec57b37
Add tests
yuslepukhin Dec 19, 2025
96a6045
All Edge tests fail
yuslepukhin Dec 22, 2025
a2570d0
Revert "All Edge tests fail"
yuslepukhin Dec 22, 2025
5a6a979
Add missing Edge tests
yuslepukhin Dec 22, 2025
635c6f8
Fix Reflect and add tests for Wrap
yuslepukhin Dec 23, 2025
096d539
Test pass on CPU
yuslepukhin Dec 23, 2025
0c521ae
Add some cases handling for CUDA
yuslepukhin Dec 23, 2025
b78cfe7
Clamp output dimensions, early exit on zero output
yuslepukhin Dec 23, 2025
f3e68f6
Produce output before early exit
yuslepukhin Dec 24, 2025
5ef84e2
Wrap is not supported on CUDA. Wrap test must be ver 18
yuslepukhin Dec 24, 2025
8319d47
Still have CUDA failures, not clear why cudaMemset is not working as …
yuslepukhin Dec 25, 2025
f6f15db
Address Copilot review comments and some bugs
yuslepukhin Jan 5, 2026
7e583dc
Address CI failures
yuslepukhin Jan 6, 2026
0058aae
Fix accidentally broken test
yuslepukhin Jan 6, 2026
49e7359
Address missing ep exclusion and re-instate original cast
yuslepukhin Jan 6, 2026
e836ba8
Remove extra files
yuslepukhin Jan 6, 2026
9ce7958
Fix TYPED_TEST macro expansion
yuslepukhin Jan 7, 2026
2c62d9f
Address review and CI failures
yuslepukhin Jan 7, 2026
584876c
Address review comments
yuslepukhin Jan 7, 2026
4d386d0
Skip for DML EP
yuslepukhin Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
405 changes: 263 additions & 142 deletions onnxruntime/core/providers/cpu/tensor/pad.cc

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions onnxruntime/core/providers/cpu/tensor/padbase.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,42 @@ class PadBase {

// End provider shared

// Only flatten innermost axes when there is no padding and no slicing on ANY axis.
// Decide whether the innermost axis can be flattened into its neighbors so
// that a single memcpy covers multiple axes.
// Flattening is permitted only when no axis is sliced at all, and the
// innermost axis additionally carries no padding.
//
// Assumes pads and slices each hold 2 * rank entries laid out as
// [starts..., ends...] — TODO confirm against callers; no bounds are
// validated here.
static bool ShouldFlattenInnerShape(gsl::span<const int64_t> input_dims,
                                    gsl::span<const int64_t> pads,
                                    gsl::span<const int64_t> slices) {
  const size_t rank = input_dims.size();
  if (rank == 0) return false;

  // Any slicing (negative pad) on any axis disables flattening.
  for (size_t i = 0; i < rank; ++i) {
    if (slices[i] != 0 || slices[rank + i] != 0) return false;
  }

  // The innermost axis must carry no padding. Its slice entries are already
  // known to be zero from the loop above, so they need not be rechecked.
  const size_t inner = rank - 1;
  return pads[inner] == 0 && pads[inner + rank] == 0;
}

// Guard: pre-pad + copy + post-pad must equal total output elements.
// Sanity guard for Pad: the three regions written to the output (leading
// constant fill, copied input data, trailing constant fill) must exactly
// tile the output tensor. Returns a FAIL status carrying the offending
// counts when they do not.
static Status ValidateTotalElementsCoverage(size_t total_output_elems,
                                            size_t prepad_elems,
                                            size_t copy_elems,
                                            size_t postpad_elems) {
  // Accumulate through SafeInt so an intermediate size_t overflow is caught
  // rather than silently wrapping.
  SafeInt<size_t> covered(prepad_elems);
  covered += copy_elems;
  covered += postpad_elems;

  if (static_cast<size_t>(covered) == total_output_elems) {
    return Status::OK();
  }

  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
                         "Pad coverage invalid: pre=", prepad_elems,
                         " copy=", copy_elems, " post=", postpad_elems,
                         " total=", total_output_elems);
}

/// <summary>
/// Flatten innermost axes that have no padding, so one memcpy covers multiple axes.
/// For example, a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0] can be flattened as
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/cuda/cuda_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ template std::unique_ptr<IConstantBuffer<Float8E5M2>> CreateConstantOnes<Float8E
template void Fill<T>(cudaStream_t stream, T * output, T value, int64_t count);

SPECIALIZED_FILL(int8_t)
SPECIALIZED_FILL(bool)
SPECIALIZED_FILL(int16_t)
SPECIALIZED_FILL(int32_t)
SPECIALIZED_FILL(int64_t)
Expand Down
77 changes: 68 additions & 9 deletions onnxruntime/core/providers/cuda/tensor/pad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
typedef typename ToCudaType<T>::MappedType CudaT;
const auto& input_tensor = *ctx->Input<Tensor>(0);
auto const& input_shape = input_tensor.Shape();
int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
const size_t dimension_count = input_shape.NumDimensions();

const PadsVector* p_pads = &pads_;
const PadsVector* p_slices = &slices_;
Expand Down Expand Up @@ -134,26 +134,85 @@
TArray<int64_t> input_strides(input_pitches);

auto output_dims(input_shape.AsShapeVector());
ORT_ENFORCE(static_cast<size_t>(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values");
ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values");

// Calculate output dimensions, and handle any negative padding
TArray<int64_t> lower_pads(dimension_count);
TArray<int64_t> upper_pads(dimension_count);
for (auto i = 0; i < dimension_count; i++) {
lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
upper_pads[i] = (*p_pads)[static_cast<int64_t>(i) + dimension_count] + (*p_slices)[static_cast<int64_t>(i) + dimension_count];
output_dims[i] += lower_pads[i] + upper_pads[i];
for (size_t i = 0; i < dimension_count; i++) {
lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
}

TensorShapeVector effective_input_extents;
effective_input_extents.reserve(dimension_count);
for (size_t i = 0; i < dimension_count; i++) {
int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +

Check warning on line 151 in onnxruntime/core/providers/cuda/tensor/pad.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <algorithm> for max [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/cuda/tensor/pad.cc:151: Add #include <algorithm> for max [build/include_what_you_use] [4]
(*p_slices)[i] + (*p_slices)[i + dimension_count],
0LL);
effective_input_extents.push_back(extent);
}

TensorShape output_shape(output_dims);
auto& output_tensor = *ctx->Output(0, output_shape);

// special case when there is a dim value of 0 in the shape. behavior depends on mode
// If the input size is zero, but output shape is not, need padding only
// this is expected for constant mode only, otherwise the output is empty
// no error
if (input_shape.Size() == 0) {
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
if (mode_ == Mode::Constant) {
const int64_t output_size = output_shape.Size();
if (output_size > 0) {
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
output_size);
}
}
// No error for other modes (preserve CPU historical behavior),
// but no output should be expected either
return Status::OK();
}

auto& output_tensor = *ctx->Output(0, output_shape);
// Early constant-fill: input is not empty as above
// However, if any effective input extent is zero, no data to copy
// only padding if any.
const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(),
[](int64_t v) { return v == 0; });

if (no_effective_data_to_copy) {
if (mode_ == Mode::Constant) {
// Attempt to pad constant mode in case output is not empty
// all other modes are an error
const int64_t output_size = output_shape.Size();
if (output_size > 0) {
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
output_size);
}
return Status::OK();
}
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
}

// Special case for Reflect mode: ensure all extents >= 2 after slicing
// otherwise reflection is not possible. Matches numpy behavior as ONNX only
// implies that this would be wrong as the start and end positions should be distinct
// values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
if (mode_ == Mode::Reflect) {
for (size_t i = 0; i < dimension_count; ++i) {
const int64_t extent = effective_input_extents[i]; // length after slicing
const bool reflect_on_axis =
(*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0;
if (reflect_on_axis && extent < 2) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Pad reflect requires axis length >= 2 after slicing. Input shape:",
input_shape);
}
}
}

// Case of all pads and slices being zero: just copy input to output
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
output_shape.Size() > 0) {
Expand All @@ -164,7 +223,7 @@
return Status::OK();
}

if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) {
// If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)

// NCHW input
Expand Down
Loading
Loading