diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index ab261bbb8cdb5..9588f69697cfb 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -347,11 +347,11 @@ void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t>
                                 gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
   const size_t dims_count = input_dims.size();
   size_t inner_axis = dims_count - 1;
-  size_t inner_size = 1;
+  SafeInt<size_t> inner_size = 1;
 
   // Find all inner most dimensions that can be flattened.
   do {
-    inner_size *= static_cast<size_t>(input_dims[inner_axis]);
+    inner_size *= input_dims[inner_axis];
 
     if (inner_axis == 0) break;
 
@@ -378,10 +378,32 @@ void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count
             reshaped_pad.begin() + new_dim_count);
 
   // Flatten inner axis.
-  reshaped_pad[inner_axis] = src_pad[inner_axis] * inner_no_pad_size;
-  reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
+  reshaped_pad[inner_axis] = SafeInt<int64_t>(src_pad[inner_axis]) * inner_no_pad_size;
+  reshaped_pad[inner_axis + new_dim_count] = SafeInt<int64_t>(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size;
 }
 
+template <typename T>
+struct OutputSink {
+  void operator()(T* output, T value) const {
+#ifdef _DEBUG
+    if (output < beg || output >= end) {
+      ORT_THROW("Pad OutputSink: Output pointer is out of range");
+    }
+#endif
+    *output = value;
+  }
+
+#ifdef _DEBUG
+  OutputSink(T* output, T* output_end)
+      : beg(output), end(output_end) {}
+
+  T* beg;
+  T* end;
+#else
+  OutputSink(T* /* output */, T* /* output_end */) {}
+#endif
+};
+
 // special handling for edge case where the input has one or more dims with value of 0
 template <typename T>
 static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
@@ -406,11 +428,11 @@ static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
 
 // This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
 template <typename T>
-static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
+static void PadAxis(OutputSink<T>& sink, T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
                     size_t block_size, size_t block_count) {
   for (size_t block_index = 0; block_index < block_count; block_index++) {
     for (size_t i = 0; i < block_size; i++) {
-      *output++ = *input;
+      sink(output++, *input);
       input += input_delta;
     }
     input += input_pitch;
@@ -420,27 +442,27 @@ static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_
 
 // These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
 // and inputPitch and inputDelta are just a single value added each iteration.
template -static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) { +static void PadInnermostAxis(OutputSink& sink, T* output, T* input, ptrdiff_t input_delta, size_t block_count) { for (size_t block_index = 0; block_index < block_count; block_index++) { - *output++ = *input; + sink(output++, *input); input += input_delta; } } // For constant padding, there is no input, just a size to write the constant to template -static void PadAxisConstant(T* output, T constant, size_t size) { +static void PadAxisConstant(OutputSink& sink, T* output, T constant, size_t size) { if (size == 1) { - *output = constant; + sink(output, constant); } else if (size == 2) { - *output = constant; - *(output + 1) = constant; + sink(output, constant); + sink(output + 1, constant); } else { // This would be faster with SSE instructions. // That would mean to have an implementation for each type (uint8, uint32, uint64). T* end = output + size; for (; output != end;) - *output++ = constant; + sink(output++, constant); } } @@ -457,7 +479,7 @@ static Status PadImpl(OpKernelContext* ctx, const auto& input_tensor = *ctx->Input(0); const auto& orig_input_shape = input_tensor.Shape(); auto output_dims(orig_input_shape.AsShapeVector()); - size_t data_rank = output_dims.size(); + const size_t data_rank = output_dims.size(); // make copy of raw_pads as it may be mutated below ORT_ENFORCE(data_rank > 0, "Input tensor has no dimensions"); @@ -465,58 +487,117 @@ static Status PadImpl(OpKernelContext* ctx, // Reshape input dims TensorShapeVector reshaped_input_dims; - PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + if (PadBase::ShouldFlattenInnerShape(output_dims, pads, slices)) { + PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + } else { + reshaped_input_dims = output_dims; + } // Reshape padding - size_t new_dims_count = reshaped_input_dims.size(); - size_t inner_axis = new_dims_count - 1; - size_t inner_no_pad_size = onnxruntime::narrow(output_dims[inner_axis] > 0 - ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] - : 0); + const size_t new_dims_count = reshaped_input_dims.size(); + const size_t inner_axis = new_dims_count - 1; + const size_t inner_no_pad_size = narrow(output_dims[inner_axis] > 0 + ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] + : 0); PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count); PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad); PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice); TensorShapeVector reshaped_output_dims = reshaped_input_dims; TensorShapeVector input_starts; - TensorShapeVector input_extents; + TensorShapeVector effective_input_extents; - // Calculate output dimensions, and handle any negative padding + // Calculate reshaped output dimensions, and handle any negative padding input_starts.reserve(new_dims_count); - input_extents.reserve(new_dims_count); + effective_input_extents.reserve(new_dims_count); for (size_t i = 0; i < new_dims_count; i++) { + // Starts for every dimension. 
If slice is negative, we need to start further in, handled by the SliceIterator input_starts.push_back(-1 * reshaped_slice[i]); - input_extents.push_back(reshaped_input_dims[i] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]); - reshaped_output_dims[i] += reshaped_pad[i] + reshaped_pad[i + new_dims_count] + + // Do not allow negative extents + int64_t extent = std::max(SafeInt(reshaped_input_dims[i]) + + reshaped_slice[i] + reshaped_slice[i + new_dims_count], + 0LL); + effective_input_extents.push_back(extent); + reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } + // Compute true output dimensions for (size_t i = 0; i < data_rank; i++) { - output_dims[i] += pads[i] + pads[i + data_rank] + slices[i] + slices[i + data_rank]; + output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; } - // special case an input with one or more dim values of 0. edge case that is easier to handle - // separately than to complicate all the code for normal usage. + // If the input is empty, but output shape may not be, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error if (orig_input_shape.Size() == 0) { return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value); } - TensorShape input_shape(reshaped_input_dims); - SliceIterator input(input_tensor, input_shape, input_starts, input_extents, {}); - - // output_shape need to keep original. + // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); + + const SafeInt total_output_elems(output_shape.Size()); auto* output = reinterpret_cast(output_tensor.MutableDataRaw()); + auto* output_end = output + static_cast(total_output_elems); + OutputSink sink(output, output_end); + + // Early constant-fill: if any effective input extent is zero (input is not empty), no data to copy + // only padding if any for constant mode, for other modes it is an error + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); + + if (no_effective_data_to_copy) { + if (mode == Mode::Constant) { + PadAxisConstant(sink, output, value, total_output_elems); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode), " with zero effective input extent"); + } + + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode == Mode::Reflect) { + for (size_t i = 0; i < new_dims_count; ++i) { + const int64_t extent = effective_input_extents[i]; // length after slicing + const bool reflect_on_axis = + (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. 
Input shape:", + orig_input_shape); + } + } + } TensorPitches output_pitches(reshaped_output_dims); - size_t alignSkip = 0; // Amount to skip to align to where the next input tensor data needs to be written + // Initial skip, sum up the start padding on each axis + SafeInt align_skip = 0; + for (size_t i = 0; i < new_dims_count; i++) { + const auto inc = SafeInt(reshaped_pad[i]) * output_pitches[i]; + align_skip += inc; + } + + // Validate coverage: pre + copy + post == total + SafeInt copy_elems = 1; + for (size_t i = 0, lim = effective_input_extents.size(); i < lim; ++i) { + // All extents are positive here due to the no_data_to_copy check above + copy_elems *= effective_input_extents[i]; + } - // Initial skip, sum up the begin padding on each axis - for (size_t i = 0; i < new_dims_count; i++) - alignSkip += SafeInt(reshaped_pad[i]) * output_pitches[i]; + const size_t prepad_elems = align_skip; + const size_t postpad_elems = SafeInt(total_output_elems) - prepad_elems - copy_elems; + ORT_RETURN_IF_ERROR(PadBase::ValidateTotalElementsCoverage( + total_output_elems, prepad_elems, copy_elems, postpad_elems)); - ExtentAxisCounters input_counters(input_extents); + TensorShape input_shape(reshaped_input_dims); + SliceIterator input(input_tensor, input_shape, input_starts, effective_input_extents, {}); + + ExtentAxisCounters input_counters(effective_input_extents); switch (mode) { case Mode::Constant: @@ -524,28 +605,41 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. while (input_counters) { - output += alignSkip; + output += align_skip; { - T* axisStart = output; - output = input.CopyInnermostAxisSolitaryInnerStep(output); - - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; - PadAxisConstant(axisStart - prePad, value, onnxruntime::narrow(prePad)); - PadAxisConstant(output, value, onnxruntime::narrow(postPad)); - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + T* axis_start = output; + // Compute the actual number of data elements to copy on the innermost axis (after cropping). + const size_t inner_extent = onnxruntime::narrow(effective_input_extents[inner_axis]); + + // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent. + ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output)); + + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + if (pre_pad > 0) { + /// Pre-pad(innermost) retro-fill remains valid(write before row_start). 
+            PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad), value, pre_pad);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(sink, axis_start + inner_extent, value, post_pad);
+          }
+          output = axis_start + inner_extent + static_cast<ptrdiff_t>(post_pad);
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()];
-          int64_t prePad = reshaped_pad[input_counters.Axis()];
-          int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count];
-          PadAxisConstant(axisStart - prePad * inner_pitch, value, SafeInt<size_t>(prePad) * inner_pitch);
-          PadAxisConstant(output, value, SafeInt<size_t>(postPad) * inner_pitch);
-          output += inner_pitch * postPad;
-          alignSkip += inner_pitch * SafeInt<size_t>(prePad);
+          T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()];
+          const SafeInt<int64_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          const SafeInt<int64_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          if (pre_pad > 0) {
+            PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad * inner_pitch), value, pre_pad * inner_pitch);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(sink, output, value, post_pad * inner_pitch);
+          }
+          output += inner_pitch * post_pad;
+          align_skip += inner_pitch * pre_pad;
         }
       }
       break;
@@ -555,35 +649,52 @@ static Status PadImpl(OpKernelContext* ctx,
      // On loop entry, 'pad' is already set to the first continuous block of padding, and
      // after every pass through the inner loop it gets set to the next continuous pad size.
      while (input_counters) {
-        output += alignSkip;
+        output += align_skip;
        {
-          T* axisStart = output;
+          const SafeInt<int64_t> inner_extent = effective_input_extents[inner_axis];
+          T* axis_start = output;
+          T* axis_end = axis_start + onnxruntime::narrow<ptrdiff_t>(inner_extent);
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
-          int64_t prePad = reshaped_pad[inner_axis];
-          int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
+          const SafeInt<int64_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<int64_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
-            PadAxisConstant(axisStart - prePad, *axisStart, onnxruntime::narrow<size_t>(prePad));
-            PadAxisConstant(output, *(output - 1), onnxruntime::narrow<size_t>(postPad));
+            if (pre_pad > 0) {
+              PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad), *axis_start, pre_pad);
+            }
+            if (post_pad > 0) {
+              PadAxisConstant(sink, output, *(output - 1), post_pad);
+            }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
- PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis])); - PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); + if (pads[inner_axis] > 0) { + PadAxis(sink, axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + } + if (pads[inner_axis + data_rank] > 0) { + PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); + } } - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + output = axis_end + static_cast(post_pad); + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; - PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(prePad)); - PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(postPad)); - output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + if (pre_pad > 0) { + PadAxis(sink, axis_start - static_cast(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch, + pre_pad); + } + if (post_pad > 0) { + PadAxis(sink, output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad); + } + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; @@ -594,97 +705,107 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. 
while (input_counters) { - output += alignSkip; + output += align_skip; { - T* axisStart = output; + T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { if (mode == Mode::Reflect) { - PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, output - 2, -1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), -1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(sink, output, output - 2, -1 /* inputDelta */, post_pad); + } } else { - PadInnermostAxis(axisStart - prePad, output - prePad, 1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, axisStart, 1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), + output - static_cast(pre_pad), 1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(sink, output, axis_start, 1 /* inputDelta */, post_pad); + } } } else { // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode. if (mode == Mode::Reflect) { - PadAxis( - axisStart - prePad, - axisStart + prePad, - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - output - 2 * inner_no_pad_size, - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + output - 2 * inner_no_pad_size, + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } else { - PadAxis( - axisStart - prePad, - output - pads[inner_axis] * inner_no_pad_size, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - axisStart, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + output - pads[inner_axis] * inner_no_pad_size, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } } - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + output += post_pad; + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = 
reshaped_pad[input_counters.Axis() + new_dims_count]; if (mode == Mode::Reflect) { - PadAxis( - axisStart - prePad * inner_pitch, - axisStart + prePad * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - onnxruntime::narrow(prePad)); - PadAxis( - output, - output - 2 * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - onnxruntime::narrow(postPad)); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + axis_start + static_cast(pre_pad) * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + output - 2 * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + post_pad); } else { - PadAxis( - axisStart - prePad * inner_pitch, - output - prePad * inner_pitch, - 1, - 0, - inner_pitch, - onnxruntime::narrow(prePad)); - PadAxis( - output, - axisStart, - 1, - 0, - inner_pitch, - onnxruntime::narrow(postPad)); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + output - static_cast(pre_pad) * inner_pitch, + 1, + 0, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_pitch, + post_pad); } - output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h index 43f9cbfc9f9a4..e2ab6ff6c8fb1 100644 --- a/onnxruntime/core/providers/cpu/tensor/padbase.h +++ b/onnxruntime/core/providers/cpu/tensor/padbase.h @@ -67,6 +67,42 @@ class PadBase { // End provider shared + // Only flatten innermost axes when there is no padding and no slicing on ANY axis. + static bool ShouldFlattenInnerShape(gsl::span input_dims, + gsl::span pads, + gsl::span slices) { + const size_t rank = input_dims.size(); + if (rank == 0) return false; + for (size_t i = 0; i < rank; ++i) { + if (slices[i] != 0 || slices[rank + i] != 0) return false; + } + + const size_t inner = rank - 1; + if (pads[inner] != 0 || pads[inner + rank] != 0 || + slices[inner] != 0 || slices[inner + rank] != 0) { + return false; + } + return true; + } + + // Guard: pre-pad + copy + post-pad must equal total output elements. + static Status ValidateTotalElementsCoverage(size_t total_output_elems, + size_t prepad_elems, + size_t copy_elems, + size_t postpad_elems) { + const size_t checked_sum = + SafeInt(prepad_elems) + + SafeInt(copy_elems) + + SafeInt(postpad_elems); + if (checked_sum != total_output_elems) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad coverage invalid: pre=", prepad_elems, + " copy=", copy_elems, " post=", postpad_elems, + " total=", total_output_elems); + } + return Status::OK(); + } + /// /// Flatten no padding inner most Axis, so one memcpy cover multiple Axis. 
/// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu index 934425656e3c9..59f2deda1805e 100644 --- a/onnxruntime/core/providers/cuda/cuda_utils.cu +++ b/onnxruntime/core/providers/cuda/cuda_utils.cu @@ -81,6 +81,7 @@ template std::unique_ptr> CreateConstantOnes(cudaStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) +SPECIALIZED_FILL(bool) SPECIALIZED_FILL(int16_t) SPECIALIZED_FILL(int32_t) SPECIALIZED_FILL(int64_t) diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index bdd6567d2ef34..656890e796a1c 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -94,7 +94,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaT; const auto& input_tensor = *ctx->Input(0); auto const& input_shape = input_tensor.Shape(); - int32_t dimension_count = static_cast(input_shape.NumDimensions()); + const size_t dimension_count = input_shape.NumDimensions(); const PadsVector* p_pads = &pads_; const PadsVector* p_slices = &slices_; @@ -134,26 +134,85 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { TArray input_strides(input_pitches); auto output_dims(input_shape.AsShapeVector()); - ORT_ENFORCE(static_cast(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); + ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); // Calculate output dimensions, and handle any negative padding TArray lower_pads(dimension_count); TArray upper_pads(dimension_count); - for (auto i = 0; i < dimension_count; i++) { - lower_pads[i] = (*p_pads)[i] + (*p_slices)[i]; - upper_pads[i] = (*p_pads)[static_cast(i) + dimension_count] + (*p_slices)[static_cast(i) + dimension_count]; - output_dims[i] += lower_pads[i] + upper_pads[i]; + for (size_t i = 0; i < dimension_count; i++) { + lower_pads[i] = SafeInt((*p_pads)[i]) + (*p_slices)[i]; + upper_pads[i] = SafeInt((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count]; + output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; + } + + TensorShapeVector effective_input_extents; + effective_input_extents.reserve(dimension_count); + for (size_t i = 0; i < dimension_count; i++) { + int64_t extent = std::max(SafeInt(input_dims[i]) + + (*p_slices)[i] + (*p_slices)[i + dimension_count], + 0LL); + effective_input_extents.push_back(extent); } TensorShape output_shape(output_dims); + auto& output_tensor = *ctx->Output(0, output_shape); - // special case when there is a dim value of 0 in the shape. 
behavior depends on mode + // If the input size is zero, but output shape is not, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error if (input_shape.Size() == 0) { ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); + if (mode_ == Mode::Constant) { + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } + } + // No error for other modes (preserve CPU historical behavior), + // but no output should be expected either + return Status::OK(); } - auto& output_tensor = *ctx->Output(0, output_shape); + // Early constant-fill: input is not empty as above + // However, if any effective input extent is zero, no data to copy + // only padding if any. + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); + + if (no_effective_data_to_copy) { + if (mode_ == Mode::Constant) { + // Attempt to pad constant mode in case output is not empty + // all other modes are an error + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode_), " with zero effective input extent"); + } + + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode_ == Mode::Reflect) { + for (size_t i = 0; i < dimension_count; ++i) { + const int64_t extent = effective_input_extents[i]; // length after slicing + const bool reflect_on_axis = + (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0; + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. 
Input shape:", + input_shape); + } + } + } + // Case of all pads and slices being zero: just copy input to output if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) && std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) && output_shape.Size() > 0) { @@ -164,7 +223,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - if (IsNCHWInputWithPaddingAlongHAndW(static_cast(dimension_count), lower_pads, upper_pads)) { + if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) { // If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW) // NCHW input diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 1d9cd15f53327..49c9d360f9046 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -763,7 +763,7 @@ edge // test handling of input with a 0 for a dimension TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 + // TODO: Unskip Dml when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold"; } @@ -774,49 +774,56 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { {1, 1}, T(1), {2}, - {T(1), T(1)}); + {T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D empty pads {}, {0, 0}, T(1), {0}, - {}); + {}, + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D offsetting pads {}, {-1, 1}, T(1), {0}, - {}); + {}, + "constant"); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 1, 1, 1}, T(1), {4, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 1, 1, 1}, T(1), {2, 4}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 0, 1, 0}, // empty pads for dim 1 T(1), {2, 2}, - {T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({2, 0, 2}, // 3D {}, {0, 1, 0, 0, 1, 0}, T(1), {2, 2, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); } // Added output shape verification b/w the output shape generated by operator specific ONNX inference and // the output shape generated by operator specific ORT implementation. After adding this verification, @@ -836,11 +843,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { // In order to remove the warning, shape inference methods needs to be fixed. TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - + // TODO: Enable Dml when fixed #41968513 using T = TypeParam; RunAllOpsetAllDomainPadTests({0}, // 1D {}, @@ -850,7 +853,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. 
Input shape:{0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -860,7 +864,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -878,7 +883,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 2, 0}, // 3D {}, @@ -886,24 +892,26 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { T(1), {2, 4, 0}, {}, - "edge"); + "edge", + OpTester::ExpectResult::kExpectSuccess, "", + {kDmlExecutionProvider}); } TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - using T = TypeParam; + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 0, 1, 0}, // allowed if it doesn't pad the empty dim T(1), {4, 0}, {}, - "reflect"); + "reflect", + OpTester::ExpectResult::kExpectSuccess, + "", + {kDmlExecutionProvider}); + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({0, 2, 1}, // 3D {}, {1, 1, 1, 1, 1, 1}, // not allowed if it pads the empty dim @@ -912,7 +920,8 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { {}, "reflect", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", {kTensorrtExecutionProvider}); + "Cannot use 'reflect' mode to pad dimension with a value of 0. 
Input shape:{0,2,1}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); } TEST(PadOpTest, BoolType) { @@ -1089,5 +1098,308 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider}); } +TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::vector input_shape = {1, 1, 4, 4}; + + const std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f}; + + // Calculate expected shape: + // dim0: 1 + 0 + 0 = 1 + // dim1: 1 + 0 + 0 = 1 + // dim2: 4 + -4 + 4 = 4 + // dim3: 4 + 0 + 0 = 4 + const std::vector expected_shape = {1, 1, 4, 4}; + const std::vector expected_data = { + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f}; + + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}, true); + test.AddInput("constant_value", {}, {0.0f}, true); + test.AddOutput("output", expected_shape, expected_data); + test.Run(); +} + +TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::initializer_list input_shape{2, 18, 4}; + + /* clang-format off */ + const std::vector input_data = { + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + }; + /* clang-format on */ + + // input_data is larger than the shape elements in this test + // constexpr const size_t input_data_size = static_cast(2) * 18 * 4; + // ASSERT_EQ(input_data_size, input_data.size()); + auto input_span = gsl::make_span(input_data.data(), static_cast(2) * 18 * 4); + + const std::initializer_list pads_shape{6}; + std::initializer_list pads = {1, 0x100000, -2, -3, 0, 1}; + ASSERT_EQ(6U, pads.size()); + + // Expected shape is as follows: + // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output + // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 1,048,594 + // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3 + // Resulting shape is {0, 1048594, 3} with 0 at the front. + // How do we handle zero shapes? Currently ONNX spec allows it. 
+ // We choose to produce a empty tensor + constexpr int64_t dim0 = 2LL + 1 - 3; + constexpr int64_t dim1 = 18LL + 0x100000 - 0; + constexpr int64_t dim2 = 4LL + -2 + 1; + const std::initializer_list output_shape{dim0, dim1, dim2}; + + std::vector output_data; // empty now + + test.AddInput("data", input_shape, input_span); + test.AddInput("pads", pads_shape, pads, true); + test.AddInput("value", {}, {100.f}, true); + + // Omit Axis input + test.AddOutput("output", output_shape, output_data); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PadOpTest, ConstantMode_MixedSigns_Small) { + const std::vector input_shape{2, 6, 4}; + std::vector input_data(2 * 6 * 4); + + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast((i % 5) + 1); + } + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 9, 3}; + + const std::vector expected_output{ + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 3.f, 4.f, 9.f, + 2.f, 3.f, 9.f, + 1.f, 2.f, 9.f, + 5.f, 1.f, 9.f, + 4.f, 5.f, 9.f, + 3.f, 4.f, 9.f}; + + ASSERT_EQ(2U * 9U * 3U, expected_output.size()); + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); +} + +TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { + const std::vector input_shape{2, 3, 5}; + + std::vector input_data(2 * 3 * 5); + std::iota(input_data.begin(), input_data.end(), 1.0f); + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 6, 4}; + + const std::vector expected_output{ + // depth 0 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + + // depth 1 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 3.0F, 4.0F, 5.0F, 9.0F, + 8.0F, 9.0F, 10.0F, 9.0F, + 13.0F, 14.0F, 15.0F, 9.0F}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); +} + +TEST(PadOpTest, EdgeMode_ZeroExtentFails) { + std::vector input_shape = {4}; + // Generate input as above + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + std::vector pads = {-4, 3}; + + const std::vector expected_shape{3}; + const std::vector expected_data = {1.f, 2.f, 3.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "edge"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, ""); + test.RunWithConfig(); +} + +TEST(PadOpTest, 
EdgeMode_ExtentOne_Valid) { + const std::vector input_shape{4}; + const std::vector input_data{1.f, 1.f, 1.f, 1.f}; + const std::vector pads{-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_output{1.f, 1.f, 1.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + +TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { + // Shape chosen to force FlattenInnerShape(): + // innermost dims {2,4} -> flattened to 8 + const std::vector input_shape = {2, 3, 2, 4}; + + std::vector input_data(2 * 3 * 2 * 4); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i); + } + + // ONNX pad order: [b0,b1,b2,b3,e0,e1,e2,e3] + // The below shape will cause flattening the last two input dims to 8 + const std::vector pads = { + 0, 0, 0, 0, // begin + 0, 0, 0, 1 // end pad only on last original axis + }; + + // Expected shape: + // flattened axis grows from 8 -> 12 + const std::vector expected_shape = {2, 3, 2, 5}; + + std::vector expected_output = { + // [0][0][0] + 0.f, 1.f, 2.f, 3.f, 3.f, + // [0][0][1] + 4.f, 5.f, 6.f, 7.f, 7.f, + + // [0][1][0] + 8.f, 9.f, 10.f, 11.f, 11.f, + // [0][1][1] + 12.f, 13.f, 14.f, 15.f, 15.f, + + // [0][2][0] + 16.f, 17.f, 18.f, 19.f, 19.f, + // [0][2][1] + 20.f, 21.f, 22.f, 23.f, 23.f, + + // [1][0][0] + 24.f, 25.f, 26.f, 27.f, 27.f, + // [1][0][1] + 28.f, 29.f, 30.f, 31.f, 31.f, + + // [1][1][0] + 32.f, 33.f, 34.f, 35.f, 35.f, + // [1][1][1] + 36.f, 37.f, 38.f, 39.f, 39.f, + + // [1][2][0] + 40.f, 41.f, 42.f, 43.f, 43.f, + // [1][2][1] + 44.f, 45.f, 46.f, 47.f, 47.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + +// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 +TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_data = {2.f, 3.f, 4.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "reflect"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, + "Pad reflect requires axis length >= 2 after slicing"); + test.RunWithConfig(); +} + +TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + + const std::vector expected_shape{4}; + // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] + const std::vector expected_data = {4, 4, 4, 4}; + + // CUDA registers only up to 18 and does not impl wrap mode + // so we force version to 19 to automatically exclude EPs that do not + // implement wrap mode similar to the above tests. 
+ OpTester test("Pad", 19); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "wrap"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.RunWithConfig(); +} + } // namespace test } // namespace onnxruntime