diff --git a/onnxruntime/core/providers/cpu/tensor/pad.cc b/onnxruntime/core/providers/cpu/tensor/pad.cc
index ab261bbb8cdb5..9588f69697cfb 100644
--- a/onnxruntime/core/providers/cpu/tensor/pad.cc
+++ b/onnxruntime/core/providers/cpu/tensor/pad.cc
@@ -347,11 +347,11 @@ void PadBase::FlattenInnerShape(gsl::span<const int64_t> input_dims, gsl::span<const int64_t>
                                 gsl::span<const int64_t> slices, TensorShapeVector& reshaped_dims) {
   const size_t dims_count = input_dims.size();
   size_t inner_axis = dims_count - 1;
-  size_t inner_size = 1;
+  SafeInt<size_t> inner_size = 1;
 
   // Find all inner most dimensions that can be flattened.
   do {
-    inner_size *= static_cast<size_t>(input_dims[inner_axis]);
+    inner_size *= input_dims[inner_axis];
 
     if (inner_axis == 0) break;
 
@@ -378,10 +378,32 @@ void PadBase::ReshapePads(gsl::span<const int64_t> src_pad, size_t src_dim_count
             reshaped_pad.begin() + new_dim_count);
 
   // Flatten inner axis.
-  reshaped_pad[inner_axis] = src_pad[inner_axis] * inner_no_pad_size;
-  reshaped_pad[inner_axis + new_dim_count] = src_pad[inner_axis + src_dim_count] * inner_no_pad_size;
+  reshaped_pad[inner_axis] = SafeInt<int64_t>(src_pad[inner_axis]) * inner_no_pad_size;
+  reshaped_pad[inner_axis + new_dim_count] = SafeInt<int64_t>(src_pad[inner_axis + src_dim_count]) * inner_no_pad_size;
 }
 
+template <typename T>
+struct OutputSink {
+  void operator()(T* output, T value) const {
+#ifdef _DEBUG
+    if (output < beg || output >= end) {
+      ORT_THROW("Pad OutputSink: Output pointer is out of range");
+    }
+#endif
+    *output = value;
+  }
+
+#ifdef _DEBUG
+  OutputSink(T* output, T* output_end)
+      : beg(output), end(output_end) {}
+
+  T* beg;
+  T* end;
+#else
+  OutputSink(T* /* output */, T* /* output_end */) {}
+#endif
+};
+
 // special handling for edge case where the input has one or more dims with value of 0
 template <typename T>
 static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
@@ -406,11 +428,11 @@ static Status PadInputWithDimValueOfZero(OpKernelContext* ctx,
 
 // This is the general padding method to n-dimensionally do edge or reflection padding (based on the inputDelta values)
 template <typename T>
-static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
+static void PadAxis(OutputSink<T>& sink, T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_pitch,
                     size_t block_size, size_t block_count) {
   for (size_t block_index = 0; block_index < block_count; block_index++) {
     for (size_t i = 0; i < block_size; i++) {
-      *output++ = *input;
+      sink(output++, *input);
       input += input_delta;
     }
     input += input_pitch;
@@ -420,27 +442,27 @@ static void PadAxis(T* output, T* input, ptrdiff_t input_delta, ptrdiff_t input_
 
 // These are optimizations of PadAxis. The inner loop is removed since the innermost axis has a blockSize of 1,
 // and inputPitch and inputDelta are just a single value added each iteration.
template -static void PadInnermostAxis(T* output, T* input, ptrdiff_t input_delta, size_t block_count) { +static void PadInnermostAxis(OutputSink& sink, T* output, T* input, ptrdiff_t input_delta, size_t block_count) { for (size_t block_index = 0; block_index < block_count; block_index++) { - *output++ = *input; + sink(output++, *input); input += input_delta; } } // For constant padding, there is no input, just a size to write the constant to template -static void PadAxisConstant(T* output, T constant, size_t size) { +static void PadAxisConstant(OutputSink& sink, T* output, T constant, size_t size) { if (size == 1) { - *output = constant; + sink(output, constant); } else if (size == 2) { - *output = constant; - *(output + 1) = constant; + sink(output, constant); + sink(output + 1, constant); } else { // This would be faster with SSE instructions. // That would mean to have an implementation for each type (uint8, uint32, uint64). T* end = output + size; for (; output != end;) - *output++ = constant; + sink(output++, constant); } } @@ -457,7 +479,7 @@ static Status PadImpl(OpKernelContext* ctx, const auto& input_tensor = *ctx->Input(0); const auto& orig_input_shape = input_tensor.Shape(); auto output_dims(orig_input_shape.AsShapeVector()); - size_t data_rank = output_dims.size(); + const size_t data_rank = output_dims.size(); // make copy of raw_pads as it may be mutated below ORT_ENFORCE(data_rank > 0, "Input tensor has no dimensions"); @@ -465,58 +487,117 @@ static Status PadImpl(OpKernelContext* ctx, // Reshape input dims TensorShapeVector reshaped_input_dims; - PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + if (PadBase::ShouldFlattenInnerShape(output_dims, pads, slices)) { + PadBase::FlattenInnerShape(output_dims, pads, slices, reshaped_input_dims); + } else { + reshaped_input_dims = output_dims; + } // Reshape padding - size_t new_dims_count = reshaped_input_dims.size(); - size_t inner_axis = new_dims_count - 1; - size_t inner_no_pad_size = onnxruntime::narrow(output_dims[inner_axis] > 0 - ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] - : 0); + const size_t new_dims_count = reshaped_input_dims.size(); + const size_t inner_axis = new_dims_count - 1; + const size_t inner_no_pad_size = narrow(output_dims[inner_axis] > 0 + ? reshaped_input_dims[inner_axis] / output_dims[inner_axis] + : 0); PadsVector reshaped_pad(2 * new_dims_count), reshaped_slice(2 * new_dims_count); PadBase::ReshapePads(pads, data_rank, new_dims_count, inner_no_pad_size, reshaped_pad); PadBase::ReshapePads(slices, data_rank, new_dims_count, inner_no_pad_size, reshaped_slice); TensorShapeVector reshaped_output_dims = reshaped_input_dims; TensorShapeVector input_starts; - TensorShapeVector input_extents; + TensorShapeVector effective_input_extents; - // Calculate output dimensions, and handle any negative padding + // Calculate reshaped output dimensions, and handle any negative padding input_starts.reserve(new_dims_count); - input_extents.reserve(new_dims_count); + effective_input_extents.reserve(new_dims_count); for (size_t i = 0; i < new_dims_count; i++) { + // Starts for every dimension. 
If slice is negative, we need to start further in, handled by the SliceIterator input_starts.push_back(-1 * reshaped_slice[i]); - input_extents.push_back(reshaped_input_dims[i] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]); - reshaped_output_dims[i] += reshaped_pad[i] + reshaped_pad[i + new_dims_count] + + // Do not allow negative extents + int64_t extent = std::max(SafeInt(reshaped_input_dims[i]) + + reshaped_slice[i] + reshaped_slice[i + new_dims_count], + 0LL); + effective_input_extents.push_back(extent); + reshaped_output_dims[i] += SafeInt(reshaped_pad[i]) + reshaped_pad[i + new_dims_count] + reshaped_slice[i] + reshaped_slice[i + new_dims_count]; } + // Compute true output dimensions for (size_t i = 0; i < data_rank; i++) { - output_dims[i] += pads[i] + pads[i + data_rank] + slices[i] + slices[i + data_rank]; + output_dims[i] += SafeInt(pads[i]) + pads[i + data_rank] + slices[i] + slices[i + data_rank]; } - // special case an input with one or more dim values of 0. edge case that is easier to handle - // separately than to complicate all the code for normal usage. + // If the input is empty, but output shape may not be, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error if (orig_input_shape.Size() == 0) { return PadInputWithDimValueOfZero(ctx, mode, orig_input_shape, output_dims, value); } - TensorShape input_shape(reshaped_input_dims); - SliceIterator input(input_tensor, input_shape, input_starts, input_extents, {}); - - // output_shape need to keep original. + // output_shape needs to keep original. TensorShape output_shape(output_dims); auto& output_tensor = *ctx->Output(0, output_shape); + + const SafeInt total_output_elems(output_shape.Size()); auto* output = reinterpret_cast(output_tensor.MutableDataRaw()); + auto* output_end = output + static_cast(total_output_elems); + OutputSink sink(output, output_end); + + // Early constant-fill: if any effective input extent is zero (input is not empty), no data to copy + // only padding if any for constant mode, for other modes it is an error + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); + + if (no_effective_data_to_copy) { + if (mode == Mode::Constant) { + PadAxisConstant(sink, output, value, total_output_elems); + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode), " with zero effective input extent"); + } + + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode == Mode::Reflect) { + for (size_t i = 0; i < new_dims_count; ++i) { + const int64_t extent = effective_input_extents[i]; // length after slicing + const bool reflect_on_axis = + (reshaped_pad[i] > 0) || (reshaped_pad[i + new_dims_count] > 0); + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. 
Input shape:", + orig_input_shape); + } + } + } TensorPitches output_pitches(reshaped_output_dims); - size_t alignSkip = 0; // Amount to skip to align to where the next input tensor data needs to be written + // Initial skip, sum up the start padding on each axis + SafeInt align_skip = 0; + for (size_t i = 0; i < new_dims_count; i++) { + const auto inc = SafeInt(reshaped_pad[i]) * output_pitches[i]; + align_skip += inc; + } + + // Validate coverage: pre + copy + post == total + SafeInt copy_elems = 1; + for (size_t i = 0, lim = effective_input_extents.size(); i < lim; ++i) { + // All extents are positive here due to the no_data_to_copy check above + copy_elems *= effective_input_extents[i]; + } - // Initial skip, sum up the begin padding on each axis - for (size_t i = 0; i < new_dims_count; i++) - alignSkip += SafeInt(reshaped_pad[i]) * output_pitches[i]; + const size_t prepad_elems = align_skip; + const size_t postpad_elems = SafeInt(total_output_elems) - prepad_elems - copy_elems; + ORT_RETURN_IF_ERROR(PadBase::ValidateTotalElementsCoverage( + total_output_elems, prepad_elems, copy_elems, postpad_elems)); - ExtentAxisCounters input_counters(input_extents); + TensorShape input_shape(reshaped_input_dims); + SliceIterator input(input_tensor, input_shape, input_starts, effective_input_extents, {}); + + ExtentAxisCounters input_counters(effective_input_extents); switch (mode) { case Mode::Constant: @@ -524,28 +605,41 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. while (input_counters) { - output += alignSkip; + output += align_skip; { - T* axisStart = output; - output = input.CopyInnermostAxisSolitaryInnerStep(output); - - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; - PadAxisConstant(axisStart - prePad, value, onnxruntime::narrow(prePad)); - PadAxisConstant(output, value, onnxruntime::narrow(postPad)); - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + T* axis_start = output; + // Compute the actual number of data elements to copy on the innermost axis (after cropping). + const size_t inner_extent = onnxruntime::narrow(effective_input_extents[inner_axis]); + + // Copy innermost block. IMPORTANT: do not rely on the returned 'output' to be end-of-the extent. + ORT_IGNORE_RETURN_VALUE(input.CopyInnermostAxisSolitaryInnerStep(output)); + + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; + if (pre_pad > 0) { + /// Pre-pad(innermost) retro-fill remains valid(write before row_start). 
+            PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad), value, pre_pad);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(sink, axis_start + inner_extent, value, post_pad);
+          }
+          output = axis_start + inner_extent + static_cast<ptrdiff_t>(post_pad);
+          align_skip = pre_pad;
         }
         // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done)
         while (input_counters.Increment()) {
           ptrdiff_t inner_pitch = onnxruntime::narrow<std::ptrdiff_t>(output_pitches[input_counters.Axis()]);
-          T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()];
-          int64_t prePad = reshaped_pad[input_counters.Axis()];
-          int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count];
-          PadAxisConstant(axisStart - prePad * inner_pitch, value, SafeInt<size_t>(prePad) * inner_pitch);
-          PadAxisConstant(output, value, SafeInt<size_t>(postPad) * inner_pitch);
-          output += inner_pitch * postPad;
-          alignSkip += inner_pitch * SafeInt<size_t>(prePad);
+          T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()];
+          const SafeInt<int64_t> pre_pad = reshaped_pad[input_counters.Axis()];
+          const SafeInt<int64_t> post_pad = reshaped_pad[input_counters.Axis() + new_dims_count];
+          if (pre_pad > 0) {
+            PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad * inner_pitch), value, pre_pad * inner_pitch);
+          }
+          if (post_pad > 0) {
+            PadAxisConstant(sink, output, value, post_pad * inner_pitch);
+          }
+          output += inner_pitch * post_pad;
+          align_skip += inner_pitch * pre_pad;
         }
       }
       break;
@@ -555,35 +649,52 @@ static Status PadImpl(OpKernelContext* ctx,
      // On loop entry, 'pad' is already set to the first continuous block of padding, and
      // after every pass through the inner loop it gets set to the next continuous pad size.
      while (input_counters) {
-        output += alignSkip;
+        output += align_skip;
        {
-          T* axisStart = output;
+          const SafeInt<int64_t> inner_extent = effective_input_extents[inner_axis];
+          T* axis_start = output;
+          T* axis_end = axis_start + onnxruntime::narrow<ptrdiff_t>(inner_extent);
           output = input.CopyInnermostAxisSolitaryInnerStep(output);
-          int64_t prePad = reshaped_pad[inner_axis];
-          int64_t postPad = reshaped_pad[inner_axis + new_dims_count];
+          const SafeInt<int64_t> pre_pad = reshaped_pad[inner_axis];
+          const SafeInt<int64_t> post_pad = reshaped_pad[inner_axis + new_dims_count];
           if (inner_no_pad_size == 1) {
-            PadAxisConstant(axisStart - prePad, *axisStart, onnxruntime::narrow<size_t>(prePad));
-            PadAxisConstant(output, *(output - 1), onnxruntime::narrow<size_t>(postPad));
+            if (pre_pad > 0) {
+              PadAxisConstant(sink, axis_start - static_cast<ptrdiff_t>(pre_pad), *axis_start, pre_pad);
+            }
+            if (post_pad > 0) {
+              PadAxisConstant(sink, output, *(output - 1), post_pad);
+            }
           } else {
             // When inner_most axis(es) do not need pad, above PadAxisConstant() do not fit for Edge mode.
             // Also general loop below after handling first pad axis with non-pad axis works fine.
- PadAxis(axisStart - prePad, axisStart, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis])); - PadAxis(output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, onnxruntime::narrow(pads[inner_axis + data_rank])); + if (pads[inner_axis] > 0) { + PadAxis(sink, axis_start - static_cast(pre_pad), axis_start, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + } + if (pads[inner_axis + data_rank] > 0) { + PadAxis(sink, output, output - inner_no_pad_size, 1, -ptrdiff_t(inner_no_pad_size), inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); + } } - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + output = axis_end + static_cast(post_pad); + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; - PadAxis(axisStart - prePad * inner_pitch, axisStart, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(prePad)); - PadAxis(output, output - inner_pitch, 1, -inner_pitch, inner_pitch, onnxruntime::narrow(postPad)); - output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = reshaped_pad[input_counters.Axis() + new_dims_count]; + if (pre_pad > 0) { + PadAxis(sink, axis_start - static_cast(pre_pad) * inner_pitch, axis_start, 1, -inner_pitch, inner_pitch, + pre_pad); + } + if (post_pad > 0) { + PadAxis(sink, output, output - inner_pitch, 1, -inner_pitch, inner_pitch, post_pad); + } + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; @@ -594,97 +705,107 @@ static Status PadImpl(OpKernelContext* ctx, // On loop entry, 'pad' is already set to the first continuous block of padding, and // after every pass through the inner loop it gets set to the next continuous pad size. 
while (input_counters) { - output += alignSkip; + output += align_skip; { - T* axisStart = output; + T* axis_start = output; output = input.CopyInnermostAxisSolitaryInnerStep(output); - int64_t prePad = reshaped_pad[inner_axis]; - int64_t postPad = reshaped_pad[inner_axis + new_dims_count]; + const SafeInt pre_pad = reshaped_pad[inner_axis]; + const SafeInt post_pad = reshaped_pad[inner_axis + new_dims_count]; if (inner_no_pad_size == 1) { if (mode == Mode::Reflect) { - PadInnermostAxis(axisStart - prePad, axisStart + prePad, -1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, output - 2, -1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), -1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(sink, output, output - 2, -1 /* inputDelta */, post_pad); + } } else { - PadInnermostAxis(axisStart - prePad, output - prePad, 1 /* inputDelta */, onnxruntime::narrow(prePad)); - PadInnermostAxis(output, axisStart, 1 /* inputDelta */, onnxruntime::narrow(postPad)); + if (pre_pad > 0) { + PadInnermostAxis(sink, axis_start - static_cast(pre_pad), + output - static_cast(pre_pad), 1 /* inputDelta */, pre_pad); + } + if (post_pad > 0) { + PadInnermostAxis(sink, output, axis_start, 1 /* inputDelta */, post_pad); + } } } else { // When inner_most axis(es) do not need pad, Above PadInnermostAxis() do not fit for Reflect mode. if (mode == Mode::Reflect) { - PadAxis( - axisStart - prePad, - axisStart + prePad, - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - output - 2 * inner_no_pad_size, - 1, - -ptrdiff_t(inner_no_pad_size * 2), - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + axis_start + static_cast(pre_pad), + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + output - 2 * inner_no_pad_size, + 1, + -ptrdiff_t(inner_no_pad_size * 2), + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } else { - PadAxis( - axisStart - prePad, - output - pads[inner_axis] * inner_no_pad_size, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis])); - PadAxis( - output, - axisStart, - 1, - 0, - inner_no_pad_size, - onnxruntime::narrow(pads[inner_axis + data_rank])); + PadAxis(sink, + axis_start - static_cast(pre_pad), + output - pads[inner_axis] * inner_no_pad_size, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis])); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_no_pad_size, + onnxruntime::narrow(pads[inner_axis + data_rank])); } } - output += postPad; - alignSkip = onnxruntime::narrow(prePad); + output += post_pad; + align_skip = pre_pad; } // Calculate the size of the next block of padding (skipping over the innermost axis since that's already done) while (input_counters.Increment()) { ptrdiff_t inner_pitch = onnxruntime::narrow(output_pitches[input_counters.Axis()]); - T* axisStart = output - inner_pitch * input_extents[input_counters.Axis()]; - int64_t prePad = reshaped_pad[input_counters.Axis()]; - int64_t postPad = reshaped_pad[input_counters.Axis() + new_dims_count]; + T* axis_start = output - inner_pitch * effective_input_extents[input_counters.Axis()]; + const SafeInt pre_pad = reshaped_pad[input_counters.Axis()]; + const SafeInt post_pad = 
reshaped_pad[input_counters.Axis() + new_dims_count]; if (mode == Mode::Reflect) { - PadAxis( - axisStart - prePad * inner_pitch, - axisStart + prePad * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - onnxruntime::narrow(prePad)); - PadAxis( - output, - output - 2 * inner_pitch, - 1, - -inner_pitch * 2, - inner_pitch, - onnxruntime::narrow(postPad)); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + axis_start + static_cast(pre_pad) * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + output - 2 * inner_pitch, + 1, + -inner_pitch * 2, + inner_pitch, + post_pad); } else { - PadAxis( - axisStart - prePad * inner_pitch, - output - prePad * inner_pitch, - 1, - 0, - inner_pitch, - onnxruntime::narrow(prePad)); - PadAxis( - output, - axisStart, - 1, - 0, - inner_pitch, - onnxruntime::narrow(postPad)); + PadAxis(sink, + axis_start - static_cast(pre_pad) * inner_pitch, + output - static_cast(pre_pad) * inner_pitch, + 1, + 0, + inner_pitch, + pre_pad); + PadAxis(sink, + output, + axis_start, + 1, + 0, + inner_pitch, + post_pad); } - output += inner_pitch * postPad; - alignSkip += inner_pitch * SafeInt(prePad); + output += inner_pitch * post_pad; + align_skip += inner_pitch * pre_pad; } } break; diff --git a/onnxruntime/core/providers/cpu/tensor/padbase.h b/onnxruntime/core/providers/cpu/tensor/padbase.h index 43f9cbfc9f9a4..e2ab6ff6c8fb1 100644 --- a/onnxruntime/core/providers/cpu/tensor/padbase.h +++ b/onnxruntime/core/providers/cpu/tensor/padbase.h @@ -67,6 +67,42 @@ class PadBase { // End provider shared + // Only flatten innermost axes when there is no padding and no slicing on ANY axis. + static bool ShouldFlattenInnerShape(gsl::span input_dims, + gsl::span pads, + gsl::span slices) { + const size_t rank = input_dims.size(); + if (rank == 0) return false; + for (size_t i = 0; i < rank; ++i) { + if (slices[i] != 0 || slices[rank + i] != 0) return false; + } + + const size_t inner = rank - 1; + if (pads[inner] != 0 || pads[inner + rank] != 0 || + slices[inner] != 0 || slices[inner + rank] != 0) { + return false; + } + return true; + } + + // Guard: pre-pad + copy + post-pad must equal total output elements. + static Status ValidateTotalElementsCoverage(size_t total_output_elems, + size_t prepad_elems, + size_t copy_elems, + size_t postpad_elems) { + const size_t checked_sum = + SafeInt(prepad_elems) + + SafeInt(copy_elems) + + SafeInt(postpad_elems); + if (checked_sum != total_output_elems) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad coverage invalid: pre=", prepad_elems, + " copy=", copy_elems, " post=", postpad_elems, + " total=", total_output_elems); + } + return Status::OK(); + } + /// /// Flatten no padding inner most Axis, so one memcpy cover multiple Axis. 
/// For example, for a shape of [1,224,224,3] with padding [0,3,3,0,0,3,3,0], can be flatten as diff --git a/onnxruntime/core/providers/cuda/cuda_utils.cu b/onnxruntime/core/providers/cuda/cuda_utils.cu index 934425656e3c9..59f2deda1805e 100644 --- a/onnxruntime/core/providers/cuda/cuda_utils.cu +++ b/onnxruntime/core/providers/cuda/cuda_utils.cu @@ -81,6 +81,7 @@ template std::unique_ptr> CreateConstantOnes(cudaStream_t stream, T * output, T value, int64_t count); SPECIALIZED_FILL(int8_t) +SPECIALIZED_FILL(bool) SPECIALIZED_FILL(int16_t) SPECIALIZED_FILL(int32_t) SPECIALIZED_FILL(int64_t) diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc index bdd6567d2ef34..656890e796a1c 100644 --- a/onnxruntime/core/providers/cuda/tensor/pad.cc +++ b/onnxruntime/core/providers/cuda/tensor/pad.cc @@ -94,7 +94,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaT; const auto& input_tensor = *ctx->Input(0); auto const& input_shape = input_tensor.Shape(); - int32_t dimension_count = static_cast(input_shape.NumDimensions()); + const size_t dimension_count = input_shape.NumDimensions(); const PadsVector* p_pads = &pads_; const PadsVector* p_slices = &slices_; @@ -134,26 +134,85 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { TArray input_strides(input_pitches); auto output_dims(input_shape.AsShapeVector()); - ORT_ENFORCE(static_cast(dimension_count) * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); + ORT_ENFORCE(dimension_count * 2 == p_pads->size(), "'pads' attribute has wrong number of values"); // Calculate output dimensions, and handle any negative padding TArray lower_pads(dimension_count); TArray upper_pads(dimension_count); - for (auto i = 0; i < dimension_count; i++) { - lower_pads[i] = (*p_pads)[i] + (*p_slices)[i]; - upper_pads[i] = (*p_pads)[static_cast(i) + dimension_count] + (*p_slices)[static_cast(i) + dimension_count]; - output_dims[i] += lower_pads[i] + upper_pads[i]; + for (size_t i = 0; i < dimension_count; i++) { + lower_pads[i] = SafeInt((*p_pads)[i]) + (*p_slices)[i]; + upper_pads[i] = SafeInt((*p_pads)[i + dimension_count]) + (*p_slices)[i + dimension_count]; + output_dims[i] += SafeInt(lower_pads[i]) + upper_pads[i]; + } + + TensorShapeVector effective_input_extents; + effective_input_extents.reserve(dimension_count); + for (size_t i = 0; i < dimension_count; i++) { + int64_t extent = std::max(SafeInt(input_dims[i]) + + (*p_slices)[i] + (*p_slices)[i + dimension_count], + 0LL); + effective_input_extents.push_back(extent); } TensorShape output_shape(output_dims); + auto& output_tensor = *ctx->Output(0, output_shape); - // special case when there is a dim value of 0 in the shape. 
behavior depends on mode + // If the input size is zero, but output shape is not, need padding only + // this is expected for constant mode only, otherwise the output is empty + // no error if (input_shape.Size() == 0) { ORT_RETURN_IF_ERROR(PadBase::HandleDimValueZero(mode_, input_shape, output_shape)); + if (mode_ == Mode::Constant) { + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } + } + // No error for other modes (preserve CPU historical behavior), + // but no output should be expected either + return Status::OK(); } - auto& output_tensor = *ctx->Output(0, output_shape); + // Early constant-fill: input is not empty as above + // However, if any effective input extent is zero, no data to copy + // only padding if any. + const bool no_effective_data_to_copy = std::any_of(effective_input_extents.begin(), effective_input_extents.end(), + [](int64_t v) { return v == 0; }); + + if (no_effective_data_to_copy) { + if (mode_ == Mode::Constant) { + // Attempt to pad constant mode in case output is not empty + // all other modes are an error + const int64_t output_size = output_shape.Size(); + if (output_size > 0) { + Fill(Stream(ctx), reinterpret_cast(output_tensor.MutableData()), value, + output_size); + } + return Status::OK(); + } + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Pad: invalid mode: ", static_cast(mode_), " with zero effective input extent"); + } + + // Special case for Reflect mode: ensure all extents >= 2 after slicing + // otherwise reflection is not possible. Matches numpy behavior as ONNX only + // implies that this would be wrong as the start and end positions should be distinct + // values and with 0 there is not one, and with 1 reflection degenerates into ambiguity. + if (mode_ == Mode::Reflect) { + for (size_t i = 0; i < dimension_count; ++i) { + const int64_t extent = effective_input_extents[i]; // length after slicing + const bool reflect_on_axis = + (*p_pads)[i] > 0 || (*p_pads)[i + dimension_count] > 0; + if (reflect_on_axis && extent < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Pad reflect requires axis length >= 2 after slicing. 
Input shape:", + input_shape); + } + } + } + // Case of all pads and slices being zero: just copy input to output if (std::all_of(p_pads->begin(), p_pads->end(), [](const int64_t v) { return v == 0; }) && std::all_of(p_slices->begin(), p_slices->end(), [](const int64_t v) { return v == 0; }) && output_shape.Size() > 0) { @@ -164,7 +223,7 @@ Status Pad::ComputeInternal(OpKernelContext* ctx) const { return Status::OK(); } - if (IsNCHWInputWithPaddingAlongHAndW(static_cast(dimension_count), lower_pads, upper_pads)) { + if (IsNCHWInputWithPaddingAlongHAndW(dimension_count, lower_pads, upper_pads)) { // If we have entered here, it means the input can only be 4-D (NCHW), 3-D (CHW), or 2-D (HW) // NCHW input diff --git a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 1d9cd15f53327..49c9d360f9046 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -763,7 +763,7 @@ edge // test handling of input with a 0 for a dimension TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 + // TODO: Unskip Dml when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { GTEST_SKIP() << "Skipping because of the following error: The difference between expected[i] and output[i] is 13, which exceeds threshold"; } @@ -774,49 +774,56 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { {1, 1}, T(1), {2}, - {T(1), T(1)}); + {T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D empty pads {}, {0, 0}, T(1), {0}, - {}); + {}, + "constant"); RunAllOpsetAllDomainPadTests({0}, // 1D offsetting pads {}, {-1, 1}, T(1), {0}, - {}); + {}, + "constant"); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 1, 1, 1}, T(1), {4, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 1, 1, 1}, T(1), {2, 4}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({0, 2}, {}, {1, 0, 1, 0}, // empty pads for dim 1 T(1), {2, 2}, - {T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1)}, + "constant"); RunAllOpsetAllDomainPadTests({2, 0, 2}, // 3D {}, {0, 1, 0, 0, 1, 0}, T(1), {2, 2, 2}, - {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}); + {T(1), T(1), T(1), T(1), T(1), T(1), T(1), T(1)}, + "constant"); } // Added output shape verification b/w the output shape generated by operator specific ONNX inference and // the output shape generated by operator specific ORT implementation. After adding this verification, @@ -836,11 +843,7 @@ TYPED_TEST(PadOpTest, Pad_Constant_DimWithZeroInput) { // In order to remove the warning, shape inference methods needs to be fixed. TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - + // TODO: Enable Dml when fixed #41968513 using T = TypeParam; RunAllOpsetAllDomainPadTests({0}, // 1D {}, @@ -850,7 +853,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. 
Input shape:{0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -860,7 +864,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, @@ -878,7 +883,8 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { {}, "edge", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", {kTensorrtExecutionProvider}); + "Cannot use 'edge' mode to pad dimension with a value of 0. Input shape:{2,2,0}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); RunAllOpsetAllDomainPadTests({2, 2, 0}, // 3D {}, @@ -886,24 +892,26 @@ TYPED_TEST(PadOpTest, Pad_Edge_DimWithZeroInput) { T(1), {2, 4, 0}, {}, - "edge"); + "edge", + OpTester::ExpectResult::kExpectSuccess, "", + {kDmlExecutionProvider}); } TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect."; - } - using T = TypeParam; + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({2, 0}, // 2D {}, {1, 0, 1, 0}, // allowed if it doesn't pad the empty dim T(1), {4, 0}, {}, - "reflect"); + "reflect", + OpTester::ExpectResult::kExpectSuccess, + "", + {kDmlExecutionProvider}); + // DML: Unskip when fixed #41968513 RunAllOpsetAllDomainPadTests({0, 2, 1}, // 3D {}, {1, 1, 1, 1, 1, 1}, // not allowed if it pads the empty dim @@ -912,7 +920,8 @@ TYPED_TEST(PadOpTest, Pad_Reflect_DimWithZeroInput) { {}, "reflect", OpTester::ExpectResult::kExpectFailure, - "Cannot use 'reflect' mode to pad dimension with a value of 0. Input shape:{0,2,1}", {kTensorrtExecutionProvider}); + "Cannot use 'reflect' mode to pad dimension with a value of 0. 
Input shape:{0,2,1}", + {kDmlExecutionProvider, kTensorrtExecutionProvider}); } TEST(PadOpTest, BoolType) { @@ -1089,5 +1098,308 @@ TEST(PadOpTest, ConstantPadNegativeAxes) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kNnapiExecutionProvider}); } +TEST(PadOpTest, ConstantFill_F32_RemovesAllDataOnAxis) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::vector input_shape = {1, 1, 4, 4}; + + const std::vector input_data = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f}; + + // Calculate expected shape: + // dim0: 1 + 0 + 0 = 1 + // dim1: 1 + 0 + 0 = 1 + // dim2: 4 + -4 + 4 = 4 + // dim3: 4 + 0 + 0 = 4 + const std::vector expected_shape = {1, 1, 4, 4}; + const std::vector expected_data = { + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f}; + + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {8}, {0, 0, -4, 0, 0, 0, 4, 0}, true); + test.AddInput("constant_value", {}, {0.0f}, true); + test.AddOutput("output", expected_shape, expected_data); + test.Run(); +} + +TEST(PadOpTest, ConstantPadLargeNegativePadNoOutput) { + OpTester test("Pad", 18); + test.AddAttribute("mode", "constant"); + + const std::initializer_list input_shape{2, 18, 4}; + + /* clang-format off */ + const std::vector input_data = { + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, + }; + /* clang-format on */ + + // input_data is larger than the shape elements in this test + // constexpr const size_t input_data_size = static_cast(2) * 18 * 4; + // ASSERT_EQ(input_data_size, input_data.size()); + auto input_span = gsl::make_span(input_data.data(), static_cast(2) * 18 * 4); + + const std::initializer_list pads_shape{6}; + std::initializer_list pads = {1, 0x100000, -2, -3, 0, 1}; + ASSERT_EQ(6U, pads.size()); + + // Expected shape is as follows: + // dim0: 2 + 1(pad) - 3(crop at the back) = (0) removed // Should produce empty output + // dim1: 18 + 0x100000(pad) - 0(crop at the front) = 1,048,594 + // dim2: 4 + -2(crop at the front) + 1(pad at the back) = 3 + // Resulting shape is {0, 1048594, 3} with 0 at the front. + // How do we handle zero shapes? Currently ONNX spec allows it. 
+ // We choose to produce a empty tensor + constexpr int64_t dim0 = 2LL + 1 - 3; + constexpr int64_t dim1 = 18LL + 0x100000 - 0; + constexpr int64_t dim2 = 4LL + -2 + 1; + const std::initializer_list output_shape{dim0, dim1, dim2}; + + std::vector output_data; // empty now + + test.AddInput("data", input_shape, input_span); + test.AddInput("pads", pads_shape, pads, true); + test.AddInput("value", {}, {100.f}, true); + + // Omit Axis input + test.AddOutput("output", output_shape, output_data); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PadOpTest, ConstantMode_MixedSigns_Small) { + const std::vector input_shape{2, 6, 4}; + std::vector input_data(2 * 6 * 4); + + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast((i % 5) + 1); + } + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 9, 3}; + + const std::vector expected_output{ + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 9.f, 9.f, 9.f, + 3.f, 4.f, 9.f, + 2.f, 3.f, 9.f, + 1.f, 2.f, 9.f, + 5.f, 1.f, 9.f, + 4.f, 5.f, 9.f, + 3.f, 4.f, 9.f}; + + ASSERT_EQ(2U * 9U * 3U, expected_output.size()); + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); +} + +TEST(PadOpTest, ConstantMode_InnermostCropThenPostPad) { + const std::vector input_shape{2, 3, 5}; + + std::vector input_data(2 * 3 * 5); + std::iota(input_data.begin(), input_data.end(), 1.0f); + + const std::vector pads{1, 3, -2, -1, 0, 1}; + const float cv = 9.0f; + const std::vector expected_shape{2, 6, 4}; + + const std::vector expected_output{ + // depth 0 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + + // depth 1 + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 9.0F, 9.0F, 9.0F, 9.0F, + 3.0F, 4.0F, 5.0F, 9.0F, + 8.0F, 9.0F, 10.0F, 9.0F, + 13.0F, 14.0F, 15.0F, 9.0F}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddInput("constant_value", {}, {cv}, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "constant"); + test.ConfigExcludeEps({kDmlExecutionProvider}); + test.RunWithConfig(); +} + +TEST(PadOpTest, EdgeMode_ZeroExtentFails) { + std::vector input_shape = {4}; + // Generate input as above + std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + std::vector pads = {-4, 3}; + + const std::vector expected_shape{3}; + const std::vector expected_data = {1.f, 2.f, 3.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "edge"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, ""); + test.RunWithConfig(); +} + +TEST(PadOpTest, 
EdgeMode_ExtentOne_Valid) { + const std::vector input_shape{4}; + const std::vector input_data{1.f, 1.f, 1.f, 1.f}; + const std::vector pads{-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_output{1.f, 1.f, 1.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + +TEST(PadOpTest, EdgeMode_FlattenedInnermostAxis) { + // Shape chosen to force FlattenInnerShape(): + // innermost dims {2,4} -> flattened to 8 + const std::vector input_shape = {2, 3, 2, 4}; + + std::vector input_data(2 * 3 * 2 * 4); + for (size_t i = 0; i < input_data.size(); ++i) { + input_data[i] = static_cast(i); + } + + // ONNX pad order: [b0,b1,b2,b3,e0,e1,e2,e3] + // The below shape will cause flattening the last two input dims to 8 + const std::vector pads = { + 0, 0, 0, 0, // begin + 0, 0, 0, 1 // end pad only on last original axis + }; + + // Expected shape: + // flattened axis grows from 8 -> 12 + const std::vector expected_shape = {2, 3, 2, 5}; + + std::vector expected_output = { + // [0][0][0] + 0.f, 1.f, 2.f, 3.f, 3.f, + // [0][0][1] + 4.f, 5.f, 6.f, 7.f, 7.f, + + // [0][1][0] + 8.f, 9.f, 10.f, 11.f, 11.f, + // [0][1][1] + 12.f, 13.f, 14.f, 15.f, 15.f, + + // [0][2][0] + 16.f, 17.f, 18.f, 19.f, 19.f, + // [0][2][1] + 20.f, 21.f, 22.f, 23.f, 23.f, + + // [1][0][0] + 24.f, 25.f, 26.f, 27.f, 27.f, + // [1][0][1] + 28.f, 29.f, 30.f, 31.f, 31.f, + + // [1][1][0] + 32.f, 33.f, 34.f, 35.f, 35.f, + // [1][1][1] + 36.f, 37.f, 38.f, 39.f, 39.f, + + // [1][2][0] + 40.f, 41.f, 42.f, 43.f, 43.f, + // [1][2][1] + 44.f, 45.f, 46.f, 47.f, 47.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_output); + test.AddAttribute("mode", "edge"); + test.Run(); +} + +// Gh issue: https://github.com/microsoft/onnxruntime/issues/11828 +TEST(PadOpTest, Pad_Reflect_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + const std::vector expected_shape{4}; + const std::vector expected_data = {2.f, 3.f, 4.f, 1.f}; + + OpTester test("Pad", 18); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "reflect"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.Config(OpTester::ExpectResult::kExpectFailure, + "Pad reflect requires axis length >= 2 after slicing"); + test.RunWithConfig(); +} + +TEST(PadOpTest, Pad_Wrap_NegativeFront_PositiveBack) { + const std::vector input_shape = {4}; + const std::vector input_data = {1.0f, 2.0f, 3.0f, 4.0f}; + const std::vector pads = {-3, 3}; + + const std::vector expected_shape{4}; + // Post-slice core: [4]; wrap 3 -> [4, 4, 4, 4] + const std::vector expected_data = {4, 4, 4, 4}; + + // CUDA registers only up to 18 and does not impl wrap mode + // so we force version to 19 to automatically exclude EPs that do not + // implement wrap mode similar to the above tests. 
+ OpTester test("Pad", 19); + test.AddInput("data", input_shape, input_data); + test.AddInput("pads", {static_cast(pads.size())}, pads, true); + test.AddOutput("output", expected_shape, expected_data); + test.AddAttribute("mode", "wrap"); + test.ConfigExcludeEps({kDmlExecutionProvider, kQnnExecutionProvider, + kTensorrtExecutionProvider, kWebGpuExecutionProvider}); + test.RunWithConfig(); +} + } // namespace test } // namespace onnxruntime