Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1af3abd
Add test model
yuslepukhin Dec 16, 2025
4be2420
Add test model
yuslepukhin Dec 16, 2025
4029951
Py formatting
yuslepukhin Dec 16, 2025
1eda095
Merge branch 'yuslepukhin/pad_rce' of https://github.com/microsoft/on…
yuslepukhin Dec 16, 2025
0b84e6b
Continue testing
yuslepukhin Dec 18, 2025
fdfe2b5
Commit the latest
yuslepukhin Dec 19, 2025
3e711a5
Fix up constant
yuslepukhin Dec 19, 2025
af3a016
More refactoring
yuslepukhin Dec 19, 2025
e1e8fa9
Add instrumentation
yuslepukhin Dec 19, 2025
ec57b37
Add tests
yuslepukhin Dec 19, 2025
96a6045
All Edge tests fail
yuslepukhin Dec 22, 2025
a2570d0
Revert "All Edge tests fail"
yuslepukhin Dec 22, 2025
5a6a979
Add missing Edge tests
yuslepukhin Dec 22, 2025
635c6f8
Fix Reflect and add tests for Wrap
yuslepukhin Dec 23, 2025
096d539
Test pass on CPU
yuslepukhin Dec 23, 2025
0c521ae
Add some cases handling for CUDA
yuslepukhin Dec 23, 2025
b78cfe7
Clamp output dimensions, early exit on zero output
yuslepukhin Dec 23, 2025
f3e68f6
Produce output before early exit
yuslepukhin Dec 24, 2025
5ef84e2
Wrap is not supported on CUDA. Wrap test must be ver 18
yuslepukhin Dec 24, 2025
8319d47
Still have CUDA failures, not clear why cudaMemset is not working as …
yuslepukhin Dec 25, 2025
f6f15db
Address Copilot review comments and some bugs
yuslepukhin Jan 5, 2026
7e583dc
Address CI failures
yuslepukhin Jan 6, 2026
0058aae
Fix accidentally broken test
yuslepukhin Jan 6, 2026
49e7359
Address missing ep exclusion and re-instate original cast
yuslepukhin Jan 6, 2026
e836ba8
Remove extra files
yuslepukhin Jan 6, 2026
9ce7958
Fix TYPED_TEST macro expansion
yuslepukhin Jan 7, 2026
2c62d9f
Address review and CI failures
yuslepukhin Jan 7, 2026
584876c
Address review comments
yuslepukhin Jan 7, 2026
4d386d0
Skip for DML EP
yuslepukhin Jan 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
405 changes: 263 additions & 142 deletions onnxruntime/core/providers/cpu/tensor/pad.cc

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions onnxruntime/core/providers/cpu/tensor/padbase.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,42 @@ class PadBase {

// End provider shared

// Only flatten innermost axes when there is no padding and no slicing on ANY axis.
// Decide whether the innermost axis can be flattened into its neighbors so
// that a single memcpy covers multiple axes.
// Flattening is permitted only when no axis is sliced at all, and the
// innermost axis additionally carries no padding.
//
// Assumes pads and slices each hold 2 * rank entries laid out as
// [starts..., ends...] — TODO confirm against callers; no bounds are
// validated here.
static bool ShouldFlattenInnerShape(gsl::span<const int64_t> input_dims,
                                    gsl::span<const int64_t> pads,
                                    gsl::span<const int64_t> slices) {
  const size_t rank = input_dims.size();
  if (rank == 0) return false;

  // Any slicing (negative pad) on any axis disables flattening.
  for (size_t i = 0; i < rank; ++i) {
    if (slices[i] != 0 || slices[rank + i] != 0) return false;
  }

  // The innermost axis must carry no padding. Its slice entries are already
  // known to be zero from the loop above, so they need not be rechecked.
  const size_t inner = rank - 1;
  return pads[inner] == 0 && pads[inner + rank] == 0;
}

// Guard: pre-pad + copy + post-pad must equal total output elements.
// Sanity guard for Pad: the three regions written to the output (leading
// constant fill, copied input data, trailing constant fill) must exactly
// tile the output tensor. Returns a FAIL status carrying the offending
// counts when they do not.
static Status ValidateTotalElementsCoverage(size_t total_output_elems,
                                            size_t prepad_elems,
                                            size_t copy_elems,
                                            size_t postpad_elems) {
  // Accumulate through SafeInt so an intermediate size_t overflow is caught
  // rather than silently wrapping.
  SafeInt<size_t> covered(prepad_elems);
  covered += copy_elems;
  covered += postpad_elems;

  if (static_cast<size_t>(covered) == total_output_elems) {
    return Status::OK();
  }

  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
                         "Pad coverage invalid: pre=", prepad_elems,
                         " copy=", copy_elems, " post=", postpad_elems,
                         " total=", total_output_elems);
}

/// <summary>
/// Flatten innermost axes that have no padding, so one memcpy covers multiple axes.
/// For example, a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0] can be flattened as
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/cuda/cuda_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ template std::unique_ptr<IConstantBuffer<Float8E5M2>> CreateConstantOnes<Float8E
template void Fill<T>(cudaStream_t stream, T * output, T value, int64_t count);

SPECIALIZED_FILL(int8_t)
SPECIALIZED_FILL(bool)
SPECIALIZED_FILL(int16_t)
SPECIALIZED_FILL(int32_t)
SPECIALIZED_FILL(int64_t)
Expand Down
77 changes: 68 additions & 9 deletions onnxruntime/core/providers/cuda/tensor/pad.cc
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
typedef typename ToCudaType<T>::MappedType CudaT;
const auto& input_tensor = *ctx->Input<Tensor>(0);
auto const& input_shape = input_tensor.Shape();
int32_t dimension_count = static_cast<int32_t>(input_shape.NumDimensions());
const size_t dimension_count = input_shape.NumDimensions();

const PadsVector* p_pads = &pads_;
const PadsVector* p_slices = &slices_;
Expand Down Expand Up @@ -134,26 +134,85 @@
TArray<int64_t> input_strides(input_pitches);

auto output_dims(input_shape.AsShapeVector());
ORT_ENFORCE(static_cast<size_t>(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values");
ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values");

// Calculate output dimensions, and handle any negative padding
TArray<int64_t> lower_pads(dimension_count);
TArray<int64_t> upper_pads(dimension_count);
for (auto i = 0; i < dimension_count; i++) {
lower_pads[i] = (*p_pads)[i] + (*p_slices)[i];
upper_pads[i] = (*p_pads)[static_cast<int64_t>(i) + dimension_count] + (*p_slices)[static_cast<int64_t>(i) + dimension_count];
output_dims[i] += lower_pads[i] + upper_pads[i];
for (size_t i = 0; i < dimension_count; i++) {
lower_pads[i] = SafeInt<int64_t>((*p_pads)[i]) + (*p_slices)[i];
upper_pads[i] = SafeInt<int64_t>((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count];
output_dims[i] += SafeInt<int64_t>(lower_pads[i]) + upper_pads[i];
}

TensorShapeVector effective_input_extents;
effective_input_extents.reserve(dimension_count);
for (size_t i = 0; i < dimension_count; i++) {
int64_t extent = std::max<int64_t>(SafeInt<int64_t>(input_dims[i]) +

Check warning on line 151 in onnxruntime/core/providers/cuda/tensor/pad.cc

View workflow job for this annotation

GitHub Actions / Optional Lint C++

[cpplint] reported by reviewdog 🐶 Add #include <algorithm> for max [build/include_what_you_use] [4] Raw Output: onnxruntime/core/providers/cuda/tensor/pad.cc:151: Add #include <algorithm> for max [build/include_what_you_use] [4]
(*p_slices)[i] + (*p_slices)[i + dimension_count],
0LL);
effective_input_extents.push_back(extent);
}

TensorShape output_shape(output_dims);
auto& output_tensor = *ctx->Output(0, output_shape);

// special case when there is a dim value of 0 in the shape. behavior depends on mode
// If the input size is zero, but output shape is not, need padding only
// this is expected for constant mode only, otherwise the output is empty
// no error
if (input_shape.Size() == 0) {
ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape));
if (mode_ == Mode::Constant) {
const int64_t output_size = output_shape.Size();
if (output_size > 0) {
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
output_size);
}
}
// No error for other modes (preserve CPU historical behavior),
// but no output should be expected either
return Status::OK();
}

auto& output_tensor = *ctx->Output(0, output_shape);
// Early constant-fill: input is not empty as above
// However, if any effective input extent is zero, no data to copy
// only padding if any.
const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(),
[](int64_t v) { return v == 0; });

if (no_effective_data_to_copy) {
if (mode_ == Mode::Constant) {
// Attempt to pad constant mode in case output is not empty
// all other modes are an error
const int64_t output_size = output_shape.Size();
if (output_size > 0) {
Fill<CudaT>(Stream(ctx), reinterpret_cast<CudaT*>(output_tensor.MutableData<T>()), value,
output_size);
}
return Status::OK();
}
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"Pad: invalid mode: ", static_cast<int>(mode_), " with zero effective input extent");
}

// Special case for Reflect mode: ensure all extents >= 2 after slicing
// otherwise reflection is not possible. Matches numpy behavior as ONNX only
// implies that this would be wrong as the start and end positions should be distinct
// values and with 0 there is not one, and with 1 reflection degenerates into ambiguity.
if (mode_ == Mode::Reflect) {
for (size_t i = 0; i < dimension_count; ++i) {
const int64_t extent = effective_input_extents[i]; // length after slicing
const bool reflect_on_axis =
(*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0;
if (reflect_on_axis && extent < 2) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"Pad reflect requires axis length >= 2 after slicing. Input shape:",
input_shape);
}
}
}

// Case of all pads and slices being zero: just copy input to output
if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) &&
std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) &&
output_shape.Size() > 0) {
Expand All @@ -164,7 +223,7 @@
return Status::OK();
}

if (IsNCHWInputWithPaddingAlongHAndW(static_cast<size_t>(dimension_count), lower_pads, upper_pads)) {
if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) {
// If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW)

// NCHW input
Expand Down
Loading
Loading