From b94ba09e4fea03288d48f41380d25499cb9b2a7a Mon Sep 17 00:00:00 2001
From: Yi Zhang
Date: Wed, 18 Sep 2024 01:12:16 +0800
Subject: [PATCH] Upgrade XNNPACK to latest version (#22012)

### Description
Update XNNPACK to the latest version (Sep 4).

- Some operator interfaces changed: the channel and stride parameters moved from the create functions into the reshape functions, e.g. https://github.com/google/XNNPACK/commit/96962a602d56dc73b345b5b42aabf7a594eceab9 (see the sketch after the softmax.cc diff below).
- The input parameters of XNNPACK's resize-related functions changed substantially.
- KleidiAI is added as a dependency on ARM64.
- The latest XNNPACK builds two static libraries, microkernels-prod and xnnpack. Without linking microkernels-prod, the build fails with "undefined symbols" errors.
- Add ORT_TARGET_PROCESSOR to determine the real target processor in CMake.
---
 cgmanifests/generated/cgmanifest.json         |  2 +-
 cmake/deps.txt                                |  4 +-
 cmake/external/xnnpack.cmake                  | 62 ++++++++++++++++++-
 .../xnnpack/AddEmscriptenAndIosSupport.patch  | 34 +++++-----
 .../core/providers/xnnpack/math/softmax.cc    | 25 ++++----
 .../core/providers/xnnpack/math/softmax.h     |  1 +
 .../core/providers/xnnpack/nn/average_pool.cc |  8 +--
 .../core/providers/xnnpack/nn/max_pool.cc     |  5 +-
 .../core/providers/xnnpack/tensor/resize.cc   | 13 ++--
 .../core/providers/xnnpack/xnnpack_kernel.h   |  4 +-
 .../templates/download-deps.yml               |  4 +-
 11 files changed, 108 insertions(+), 54 deletions(-)

diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index f8589598c7571..654099958b21b 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -146,7 +146,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "0da379fc4808f9601faef392352018c741c0f297",
+          "commitHash": "309b75c9e56e0a674bf78d59872ce131f814dfb6",
           "repositoryUrl": "https://github.com/google/XNNPACK.git"
         },
         "comments": "googlexnnpack"
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 597c051b5f477..342184bda2f0e 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -29,7 +29,8 @@ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34
 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177
 google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752
 googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349
-googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73
+#xnnpack 2024.09.04
+googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A
 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c
 microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
@@ -60,3 +61,4 @@ composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/arch
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.5.2.zip;11071a47594b20f00af09aad83e0d5203ccf6029
 dawn;https://github.com/google/dawn/archive/511eb80847afe6bded34ec491a38d5d78ba2d604.zip;c493f5aca5586f6634e25d0121c85df71189fb99
+kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/v0.2.0/kleidiai-v0.2.0.zip;B1E3173992FD91F20DB904AB77D6E901778C2681
diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake
index 41f02ce6f22bc..9519e4e6a7796 100644
--- a/cmake/external/xnnpack.cmake
+++ b/cmake/external/xnnpack.cmake
@@ -5,6 +5,8 @@ set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
 set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
 set(PTHREADPOOL_BUILD_TESTS OFF CACHE INTERNAL "")
 set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+set(KLEIDIAI_BUILD_TESTS OFF CACHE INTERNAL "")
+set(KLEIDIAI_BUILD_BENCHMARK OFF CACHE INTERNAL "")
 
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
   set(XNNPACK_USE_SYSTEM_LIBS OFF)
@@ -30,6 +32,60 @@ set(FXDIV_SOURCE_DIR ${fxdiv_SOURCE_DIR})
 FetchContent_Declare(pthreadpool URL ${DEP_URL_pthreadpool} URL_HASH SHA1=${DEP_SHA1_pthreadpool})
 onnxruntime_fetchcontent_makeavailable(pthreadpool)
 
+# --- Determine the target processor.
+# Why ORT_TARGET_PROCESSOR is only used for XNNPACK:
+# so far, ONNX Runtime with XNNPACK allows only one target processor,
+# while we do support the macOS universal package, so
+# CMAKE_OSX_ARCHITECTURES_COUNT greater than 1 is allowed in other places.
+IF(CMAKE_OSX_ARCHITECTURES)
+  LIST(LENGTH CMAKE_OSX_ARCHITECTURES CMAKE_OSX_ARCHITECTURES_COUNT)
+  IF(CMAKE_OSX_ARCHITECTURES_COUNT GREATER 1)
+    MESSAGE(STATUS "Building ONNX Runtime with XNNPACK and multiple OSX architectures is not supported. Got:(${CMAKE_OSX_ARCHITECTURES}). "
+                   "Please specify a single architecture in CMAKE_OSX_ARCHITECTURES and re-configure.")
+  ENDIF()
+  IF(NOT CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64|arm64e|arm64_32)$")
+    MESSAGE(FATAL_ERROR "Unrecognized CMAKE_OSX_ARCHITECTURES value \"${CMAKE_OSX_ARCHITECTURES}\"")
+  ENDIF()
+  SET(ORT_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
+  ADD_COMPILE_OPTIONS("-Wno-shorten-64-to-32")
+ELSEIF(CMAKE_GENERATOR MATCHES "^Visual Studio " AND CMAKE_GENERATOR_PLATFORM)
+  IF(CMAKE_GENERATOR_PLATFORM MATCHES "^Win32")
+    SET(ORT_TARGET_PROCESSOR "x86")
+  ELSEIF(CMAKE_GENERATOR_PLATFORM MATCHES "^x64")
+    SET(ORT_TARGET_PROCESSOR "x86_64")
+  ELSEIF(CMAKE_GENERATOR_PLATFORM MATCHES "^ARM64")
+    SET(ORT_TARGET_PROCESSOR "arm64")
+  ELSEIF(CMAKE_GENERATOR_PLATFORM MATCHES "^ARM64EC")
+    SET(ORT_TARGET_PROCESSOR "arm64")
+  ELSE()
+    MESSAGE(FATAL_ERROR "Unsupported Visual Studio architecture \"${CMAKE_GENERATOR_PLATFORM}\"")
+  ENDIF()
+ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "^i[3-7]86$")
+  SET(ORT_TARGET_PROCESSOR "x86")
+ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64")
+  SET(ORT_TARGET_PROCESSOR "x86_64")
+ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]")
+  SET(ORT_TARGET_PROCESSOR "arm")
+ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+  SET(ORT_TARGET_PROCESSOR "arm64")
+ELSEIF(CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le")
+  SET(ORT_TARGET_PROCESSOR "ppc64")
+ELSEIF(NOT ORT_TARGET_PROCESSOR MATCHES "^(x86(_64)?|arm64|riscv(32|64|128)|Hexagon|ppc64)$")
+  SET(ORT_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
+ELSE()
+  MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_PROCESSOR value \"${CMAKE_SYSTEM_PROCESSOR}\"")
+ENDIF()
+MESSAGE(STATUS "Building for ORT_TARGET_PROCESSOR: ${ORT_TARGET_PROCESSOR}")
+
+# KleidiAI is only used on arm64 platforms and is not supported by MSVC; for details see
+# https://github.com/google/XNNPACK/blob/3b3f7b8a6668f6ab3b6ce33b9f1d1fce971549d1/CMakeLists.txt#L206C82-L206C117
+if(ORT_TARGET_PROCESSOR MATCHES "^arm64.*" AND NOT CMAKE_C_COMPILER_ID STREQUAL "MSVC")
+  FetchContent_Declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai})
+  onnxruntime_fetchcontent_makeavailable(kleidiai)
+  set(KLEIDIAI_SOURCE_DIR ${kleidiai_SOURCE_DIR})
+endif()
+
 FetchContent_Declare(googlexnnpack URL ${DEP_URL_googlexnnpack} URL_HASH SHA1=${DEP_SHA1_googlexnnpack}
                      PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/xnnpack/AddEmscriptenAndIosSupport.patch
 )
@@ -37,8 +93,10 @@ onnxruntime_fetchcontent_makeavailable(googlexnnpack)
 set(XNNPACK_DIR ${googlexnnpack_SOURCE_DIR})
 set(XNNPACK_INCLUDE_DIR ${XNNPACK_DIR}/include)
 
-set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK pthreadpool)
-
+set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK microkernels-prod pthreadpool)
+if(ORT_TARGET_PROCESSOR MATCHES "^arm64.*" AND NOT CMAKE_C_COMPILER_ID STREQUAL "MSVC")
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES_XNNPACK kleidiai)
+endif()
 
 # the XNNPACK CMake setup doesn't include the WASM kernels so we have to manually set those up
 if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
diff --git a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch
index 736fffb1e384c..3abf2d3afec42 100644
--- a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch
+++ b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch
@@ -1,8 +1,8 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index dba9b4687..a4345898d 100755
+index 1ff85b538..c3ef2183f 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -122,7 +122,7 @@ ENDIF()
+@@ -253,7 +253,7 @@ ENDIF()
  # ---[ Build flags
  IF(NOT CMAKE_SYSTEM_NAME)
    MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined")
@@ -11,29 +11,27 @@ index dba9b4687..a4345898d 100755
    MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME value \"${CMAKE_SYSTEM_NAME}\"")
  ENDIF()
  IF(CMAKE_SYSTEM_NAME MATCHES "Windows")
-@@ -534,7 +534,12 @@ IF(XNNPACK_BUILD_LIBRARY)
-   TARGET_LINK_LIBRARIES(operator-utils PRIVATE logging)
-   TARGET_LINK_LIBRARIES(post-operation PRIVATE logging)
-   TARGET_LINK_LIBRARIES(subgraph PRIVATE allocator logging memory mutex operators operator-run)
--  TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph)
+@@ -763,7 +763,12 @@ IF(XNNPACK_BUILD_LIBRARY)
+   TARGET_LINK_LIBRARIES(operator-run PRIVATE xnnpack-base logging)
+   TARGET_LINK_LIBRARIES(operator-utils PRIVATE xnnpack-base logging)
+   TARGET_LINK_LIBRARIES(subgraph PRIVATE xnnpack-base allocator logging memory mutex operators operator-run)
+-  TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph)
 +  IF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-+    # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake
-+    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation subgraph)
++    # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake
++    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing subgraph)
 +  ELSE()
-+    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph)
-+  ENDIF()
++    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph)
++  ENDIF()
+   TARGET_LINK_LIBRARIES(XNNPACK PUBLIC xnnpack-base)
  SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES)
  ENDIF()
- IF(NOT MSVC)
-@@ -543,8 +548,9 @@ ENDIF()
+@@ -772,7 +777,8 @@ IF(NOT MSVC)
+ ENDIF()
  IF(XNNPACK_TARGET_PROCESSOR STREQUAL "arm")
    SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -marm ")
-   SET_PROPERTY(SOURCE ${PROD_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -marm ")
-   SET_PROPERTY(SOURCE ${ALL_ARMSIMD32_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv6 -mfpu=vfp -munaligned-access ")
--  SET_PROPERTY(SOURCE ${PROD_ARMSIMD32_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv6 -mfpu=vfp -munaligned-access ")
+   # set this to armv7-a to work around a build issue. we don't target armv6 so it shouldn't matter
-+  SET_PROPERTY(SOURCE ${ALL_ARMSIMD32_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=vfp -munaligned-access ")
-+  SET_PROPERTY(SOURCE ${PROD_ARMSIMD32_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=vfp -munaligned-access ")
++  SET_PROPERTY(SOURCE ${ALL_ARMSIMD32_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=vfp -munaligned-access ")
   SET_PROPERTY(SOURCE ${ALL_NEON_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=neon ")
-   SET_PROPERTY(SOURCE ${PROD_NEON_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=neon ")
   SET_PROPERTY(SOURCE ${ALL_NEONFP16_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -march=armv7-a -mfpu=neon-fp16 ")
 + # GCC requires -mfp16-format=ieee to define __fp16 type, but Clang doesn't support this option at all.
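A note on the "undefined symbols" bullet from the description, which this patch file and cmake/external/xnnpack.cmake above both encode: the upgraded XNNPACK splits its output into the xnnpack and microkernels-prod static libraries, and the operator code in the former references microkernel symbols that live in the latter. The stand-alone program below is hypothetical (the file names and link command are assumptions, not part of this PR), but it illustrates the failure mode:

```cpp
// Hypothetical stand-alone consumer of the upgraded XNNPACK, only meant to
// show the link-time requirement. Build sketch (library names assumed):
//
//   c++ main.cc libXNNPACK.a libmicrokernels-prod.a libpthreadpool.a -lpthread
//
// Omitting libmicrokernels-prod.a from that command reproduces the
// "undefined symbols" errors mentioned in the description: libXNNPACK.a holds
// the operator/runtime code, while the production microkernel implementations
// it references now live in the second archive.
#include <cstdio>

#include "xnnpack.h"

int main() {
  // xnn_initialize walks the hardware-config and microkernel tables, so even
  // this minimal program already pulls in microkernels-prod symbols.
  if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) {
    std::fprintf(stderr, "xnn_initialize failed\n");
    return 1;
  }
  std::puts("XNNPACK initialized");
  return 0;
}
```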
diff --git a/onnxruntime/core/providers/xnnpack/math/softmax.cc b/onnxruntime/core/providers/xnnpack/math/softmax.cc
index 87440b7814176..43e3ac193de5d 100644
--- a/onnxruntime/core/providers/xnnpack/math/softmax.cc
+++ b/onnxruntime/core/providers/xnnpack/math/softmax.cc
@@ -166,24 +166,21 @@ Softmax::Softmax(const OpKernelInfo& info) : XnnpackKernel{info} {
   if (op_type_ == OpComputeType::op_compute_type_qu8) {
     // the order of input tensor, x,x_scale, x_zp, y_scale, y_zp
     OpQuantParam quant_param = ParseQuantParamForOp(info, x_dtype, 1);
-    xstatus = xnn_create_softmax_nc_qu8(channels,
-                                        channels,
-                                        channels,
-                                        quant_param[0].first[0],  // x_scale
-                                        quant_param[1].second,    // y_zp
-                                        quant_param[1].first[0],  // y_scale
-                                        0,                        // flags,
-                                        &p);
+    xstatus = xnn_create_softmax_nc_qu8(
+        quant_param[0].first[0],  // x_scale, input scale
+        quant_param[1].second,    // y_zp, output zero point
+        quant_param[1].first[0],  // y_scale, output scale
+        0,                        // flags,
+        &p);
   } else if (op_type_ == OpComputeType::op_compute_type_fp32) {
-    xstatus = xnn_create_softmax_nc_f32(channels,
-                                        channels,
-                                        channels,
-                                        0,  // flags,
-                                        &p);
+    xstatus = xnn_create_softmax_nc_f32(
+        0,  // flags,
+        &p);
   }
 
   ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_softmax_nc_", OpTypeToString(op_type_), " failed. Status:", xstatus);
+  channel_dim_ = channels;
   op0_.reset(p);
 }
@@ -205,7 +202,7 @@ Status Softmax::Compute(OpKernelContext* ctx) const {
   auto reshape_fn = op_type_ == OpComputeType::op_compute_type_qu8 ? xnn_reshape_softmax_nc_qu8
                                                                    : xnn_reshape_softmax_nc_f32;
 
-  status = reshape_fn(op0_.get(), N, threadpool);
+  status = reshape_fn(op0_.get(), channel_dim_, channel_dim_, channel_dim_, N, threadpool);
 
   if (status != xnn_status_success) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_softmax_nc_", OpTypeToString(op_type_),
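This is the sketch referenced from the description: the channel count used to be baked into the operator at create time and the old reshape only took the batch size; after the upgrade, channels and the input/output strides travel with every reshape call. The sketch below mirrors the f32 path of the diff above; the create and reshape signatures come from the hunks themselves, while the setup call and its argument order are an assumption of this sketch.

```cpp
#include <vector>

#include "pthreadpool.h"
#include "xnnpack.h"

// Minimal sketch of the upgraded call sequence (assumes xnn_initialize()
// has already been called). Old API for comparison:
//   xnn_create_softmax_nc_f32(channels, input_stride, output_stride, flags, &op);
//   xnn_reshape_softmax_nc_f32(op, batch_size, threadpool);
xnn_status RunSoftmax(const std::vector<float>& x, std::vector<float>& y,
                      size_t batch, size_t channels, pthreadpool_t threadpool) {
  xnn_operator_t op = nullptr;

  // New create: no channels or strides any more, just flags.
  xnn_status status = xnn_create_softmax_nc_f32(/*flags=*/0, &op);
  if (status != xnn_status_success) return status;

  // New reshape: channels, input_stride, output_stride, batch_size -- the
  // same (channel_dim_, channel_dim_, channel_dim_, N) pattern as the
  // reshape_fn call above. A dense tensor has stride == channels.
  status = xnn_reshape_softmax_nc_f32(op, channels, channels, channels,
                                      batch, threadpool);
  if (status != xnn_status_success) return status;

  status = xnn_setup_softmax_nc_f32(op, x.data(), y.data());  // assumed signature
  if (status != xnn_status_success) return status;

  status = xnn_run_operator(op, threadpool);
  xnn_delete_operator(op);
  return status;
}
```

The practical consequence, visible in softmax.h below, is that the kernel now has to remember the channel count itself (channel_dim_) because the operator object no longer does.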
diff --git a/onnxruntime/core/providers/xnnpack/math/softmax.h b/onnxruntime/core/providers/xnnpack/math/softmax.h
index 8c6fba6c822a1..9a8055ff34a57 100644
--- a/onnxruntime/core/providers/xnnpack/math/softmax.h
+++ b/onnxruntime/core/providers/xnnpack/math/softmax.h
@@ -23,6 +23,7 @@ class Softmax final : public XnnpackKernel {
   int opset_;
   OpComputeType op_type_ = OpComputeType::op_compute_type_invalid;
   XnnpackOperator op0_;
+  int64_t channel_dim_;
 };
 }  // namespace xnnpack
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
index 58c209a13cd0c..b31b5a94899bf 100644
--- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc
@@ -15,7 +15,6 @@ namespace onnxruntime {
 namespace xnnpack {
 namespace {
 Status CreateXnnpackKernel(const PoolAttributes& pool_attrs,
-                           int64_t C,
                            const std::optional<std::pair<float, float>>& clip_min_max,
                            struct xnn_operator*& p,
                            const OpQuantParam& quant_param,
@@ -42,7 +41,6 @@ Status CreateXnnpackKernel(const PoolAttributes& pool_attrs,
                                                input_padding_bottom, input_padding_left,
                                                pooling_height, pooling_width,
                                                stride_height, stride_width,
-                                               C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                                                foutput_min, foutput_max, flags, &p);
   } else if (avgpool_type == OpComputeType::op_compute_type_qu8) {
     const float output_scale = quant_param[1].first[0];
@@ -53,7 +51,6 @@ Status CreateXnnpackKernel(const PoolAttributes& pool_attrs,
                                                input_padding_bottom, input_padding_left,
                                                pooling_height, pooling_width,
                                                stride_height, stride_width,
-                                               C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                                                quant_param[0].second,
                                                quant_param[0].first[0],
                                                quant_param[1].second,
@@ -209,7 +206,7 @@ AveragePool::AveragePool(const OpKernelInfo& info)
     ORT_THROW("unsupported AveragePool in XnnpackEP, we have FLOAT|UINT8, but got ", stype);
   }
   struct xnn_operator* p;
-  auto ret = CreateXnnpackKernel(pool_attrs_, C, clip_min_max_, p,
+  auto ret = CreateXnnpackKernel(pool_attrs_, clip_min_max_, p,
                                  quant_param, avgpool_type_);
   ORT_ENFORCE(ret.IsOK(), ret.ErrorMessage());
   op0_.reset(p);
@@ -222,6 +219,7 @@ Status AveragePool::Compute(OpKernelContext* context) const {
   int64_t N = X_shape[0];
   int64_t H = X_shape[1];
   int64_t W = X_shape[2];
+  int64_t C = X_shape[3];
 
   // set the N dim to the correct value
   TensorShapeVector output_dims{output_dims_};
@@ -247,7 +245,7 @@ Status AveragePool::Compute(OpKernelContext* context) const {
                         ? xnn_reshape_average_pooling2d_nhwc_f32
                         : xnn_reshape_average_pooling2d_nhwc_qu8;
 
-  auto status = reshape_fn(op0_.get(), N, H, W,
+  auto status = reshape_fn(op0_.get(), N, H, W, C, C, C,
                            &workspace_size, &workspace_alignment,
                            /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
                            threadpool);
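The pooling changes above (and in max_pool.cc below) follow the same pattern, with one extra wrinkle: the reshape functions take channels, input_pixel_stride and output_pixel_stride as three separate arguments. For the dense NHWC tensors ONNX Runtime feeds them, all three equal the channel count C read from the input shape at Compute time, hence the repeated C, C, C. A small sketch of why those values coincide:

```cpp
#include <cstddef>

// Dense NHWC layout: element (n, h, w, c) lives at
//   ((n * H + h) * W + w) * C + c
// Moving one pixel to the right (w -> w + 1) advances the address by exactly
// C elements, so for an unpadded NHWC tensor
//   channels == input_pixel_stride == output_pixel_stride == C,
// which is what the reshape_fn(op0_.get(), N, H, W, C, C, C, ...) calls pass.
inline std::size_t NhwcOffset(std::size_t n, std::size_t h, std::size_t w,
                              std::size_t c, std::size_t H, std::size_t W,
                              std::size_t C) {
  return ((n * H + h) * W + w) * C + c;
}
```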
diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
index 2ef9f97f77b14..0f0b827974f66 100644
--- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
+++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc
@@ -172,7 +172,6 @@ MaxPool::MaxPool(const OpKernelInfo& info)
                                             pooling_height, pooling_width,
                                             stride_height, stride_width,
                                             dilation_height, dilation_width,
-                                            C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                                             foutput_min, foutput_max, flags, &p);
   } else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
     maxpool_type_ = OpComputeType::op_compute_type_qu8;
@@ -183,7 +182,6 @@ MaxPool::MaxPool(const OpKernelInfo& info)
                                            pooling_height, pooling_width,
                                            stride_height, stride_width,
                                            dilation_height, dilation_width,
-                                           C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                                            output_min, output_max, flags, &p);
   } else if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_INT8) {
     maxpool_type_ = OpComputeType::op_compute_type_qs8;
@@ -194,7 +192,6 @@ MaxPool::MaxPool(const OpKernelInfo& info)
                                            pooling_height, pooling_width,
                                            stride_height, stride_width,
                                            dilation_height, dilation_width,
-                                           C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                                            output_min, output_max, flags, &p);
   } else {
     auto stype = DataTypeImpl::ToString(DataTypeImpl::TypeFromProto(*X_arg.TypeAsProto()));
@@ -213,6 +210,7 @@ Status MaxPool::Compute(OpKernelContext* context) const {
   int64_t N = X_shape[0];
   int64_t H = X_shape[1];
   int64_t W = X_shape[2];
+  int64_t C = X_shape[3];
 
   // set the N dim to the correct value
   TensorShapeVector output_dims{output_dims_};
@@ -234,6 +232,7 @@ Status MaxPool::Compute(OpKernelContext* context) const {
   }
 
   auto status = reshape_fn(op0_.get(), N, H, W,
+                           C, C, C,  // channels, input_pixel_stride, output_pixel_stride
                            /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
                            threadpool);
   if (status != xnn_status_success) {
diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc
index cf874796ba169..db5648d5d6e54 100644
--- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc
+++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc
@@ -214,8 +214,6 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf
     }
   }
 
-  int64_t channels = x_shape->dim(3).dim_value();
-
   uint32_t flags = 0;
   ORT_ENFORCE(mode_ == UpsampleMode::LINEAR, "only support bilinear resize");
   if (coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS) {
@@ -227,12 +225,14 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf
 
   xnn_status xstatus = xnn_status_invalid_state;
   struct xnn_operator* p = nullptr;
+  auto out_h = output_dims_[1];
+  auto out_w = output_dims_[2];
   if (op_type_ == OpComputeType::op_compute_type_fp32) {
-    xstatus = xnn_create_resize_bilinear2d_nhwc_f32(channels, channels, channels, flags, &p);
+    xstatus = xnn_create_resize_bilinear2d_nhwc_f32(out_h, out_w, flags, &p);
   } else if (op_type_ == OpComputeType::op_compute_type_qu8) {
-    xstatus = xnn_create_resize_bilinear2d_nhwc_u8(channels, channels, channels, flags, &p);
+    xstatus = xnn_create_resize_bilinear2d_nhwc_u8(out_h, out_w, flags, &p);
   } else {
-    xstatus = xnn_create_resize_bilinear2d_nhwc_s8(channels, channels, channels, flags, &p);
+    xstatus = xnn_create_resize_bilinear2d_nhwc_s8(out_h, out_w, flags, &p);
   }
   ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), " failed. Status:",
@@ -248,6 +248,7 @@ Status Resize::ComputeInternal(OpKernelContext* ctx, const Tensor* input,
   auto N = X_shape[0];
   auto H = X_shape[1];
   auto W = X_shape[2];
+  auto C = X_shape[3];
 
   Tensor* output = ctx->Output(0, TensorShape(output_dims));
 
   pthreadpool_t threadpool = GetThreadPool();
@@ -266,7 +267,7 @@ Status Resize::ComputeInternal(OpKernelContext* ctx, const Tensor* input,
     reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_s8;
   }
 
-  auto status = reshape_fn(op0_.get(), N, H, W, output_dims[1], output_dims[2],
+  auto status = reshape_fn(op0_.get(), N, H, W, C, C, C,
                            &workspace_size, &workspace_alignment, threadpool);
   if (status != xnn_status_success) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_resize_bilinear2d_nhwc_", OpTypeToString(op_type_),
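Resize inverts the pattern slightly: the upgraded create call fixes the output extent (out_h, out_w) in the operator, while batch, input extent and the channel/pixel strides now arrive at reshape time together with a caller-provided workspace. The sketch below follows the f32 path of the diff above; the create and reshape signatures come from the hunks, whereas the setup call and the plain malloc for the workspace (ORT uses its own allocator and honors the returned alignment) are assumptions of this sketch.

```cpp
#include <cstdlib>

#include "pthreadpool.h"
#include "xnnpack.h"

// Hedged sketch of the upgraded bilinear-resize sequence (error handling and
// workspace alignment elided; assumes xnn_initialize() was already called).
void SketchResize(const float* x, float* y,
                  size_t N, size_t H, size_t W, size_t C,
                  size_t out_h, size_t out_w, pthreadpool_t threadpool) {
  xnn_operator_t op = nullptr;

  // Output size is now a property of the operator (was: channels x3).
  xnn_create_resize_bilinear2d_nhwc_f32(out_h, out_w, /*flags=*/0, &op);

  // Batch, input extent and strides are supplied per reshape, exactly as in
  // reshape_fn(op0_.get(), N, H, W, C, C, C, &workspace_size, ...) above.
  size_t workspace_size = 0;
  size_t workspace_alignment = 0;
  xnn_reshape_resize_bilinear2d_nhwc_f32(op, N, H, W, C, C, C,
                                         &workspace_size, &workspace_alignment,
                                         threadpool);

  void* workspace = std::malloc(workspace_size);
  xnn_setup_resize_bilinear2d_nhwc_f32(op, workspace, x, y);  // assumed signature
  xnn_run_operator(op, threadpool);

  xnn_delete_operator(op);
  std::free(workspace);
}
```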
diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h
index 0978a88288114..31512586be19d 100644
--- a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h
+++ b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h
@@ -48,7 +48,7 @@ class XnnpackKernel : public OpKernel {
       // auto_code_cache.reset(&code_cache_);
 #endif
       // status = xnn_init_weights_cache(&weights_cache_);
-      xnn_weights_cache_t weights_cache = nullptr;
-      status = xnn_create_weights_cache(&weights_cache, 0);
+      xnn_weights_cache_t weights_cache_provider = nullptr;
+      status = xnn_create_weights_cache(&weights_cache_provider, 0);
       ORT_ENFORCE(status == xnn_status_success, "Failed to create XNNPACK weights cache");
-      auto_weights_cache.reset(weights_cache);
+      auto_weights_cache.reset(weights_cache_provider);
@@ -57,7 +57,7 @@
   }
 
   // std::unique_ptr<xnn_code_cache, decltype(&xnn_release_code_cache)> auto_code_cache;
-  std::unique_ptr<xnn_weights_cache, decltype(&xnn_delete_weights_cache)> auto_weights_cache;
+  std::unique_ptr<xnn_weights_cache_provider, decltype(&xnn_delete_weights_cache)> auto_weights_cache;
 
   // private:
   // #if defined(XNN_CACHE_ENABLE) && XNN_PLATFORM_JIT
diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
index 3c74c70ed102d..cbba1cb8ba8bd 100644
--- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
@@ -11,7 +11,7 @@ steps:
     packageType: upack
     feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
     definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
-    version: 1.0.181
+    version: 1.0.184
     downloadPath: $(Build.BinariesDirectory)/deps

# The private ADO project
@@ -22,7 +22,7 @@ steps:
     packageType: upack
     feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
    definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
-    version: 1.0.181
+    version: 1.0.184
     downloadPath: $(Build.BinariesDirectory)/deps

# You can add more ADO accounts at here.
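Closing note on the xnnpack_kernel.h hunks: the upgraded headers route the weights cache through a provider object (the diff suggests xnn_weights_cache_t is now a pointer to xnn_weights_cache_provider, which is why the local variable and the unique_ptr element type change together). A hedged sketch of the resulting ownership pattern, with the provider type taken as an assumption from the diff:

```cpp
#include <memory>

#include "xnnpack.h"

// Mirrors XnnpackKernel above: create the cache through the C API and hand
// ownership to a unique_ptr whose deleter is the matching C destructor.
using WeightsCachePtr =
    std::unique_ptr<xnn_weights_cache_provider, decltype(&xnn_delete_weights_cache)>;

WeightsCachePtr MakeWeightsCache() {
  xnn_weights_cache_t cache = nullptr;  // assumed: xnn_weights_cache_provider*
  if (xnn_create_weights_cache(&cache, /*size=*/0) != xnn_status_success) {
    return WeightsCachePtr(nullptr, xnn_delete_weights_cache);
  }
  return WeightsCachePtr(cache, xnn_delete_weights_cache);
}
```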