diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 203aba2c3dd91..52d4e435badd1 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -25,6 +25,11 @@ constexpr bool Is4BitIntType(int32_t data_type) { (data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4); } +constexpr bool IsFloatType(int32_t data_type) { + return (data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT) || + (data_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16); +} + // adjust for an optional input/output that has an entry but does not exist int NumActualValues(const Node& node, bool input) { const auto& defs = input ? node.InputDefs() : node.OutputDefs(); @@ -336,38 +341,68 @@ bool ConvNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, const std::vector& q_nodes) const { - if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes)) { + auto is_const_float = [&graph_viewer](const NodeArg* input_def) { + const ONNX_NAMESPACE::TensorProto* initializer = graph_viewer.GetConstantInitializer(input_def->Name()); + return (initializer != nullptr) && IsFloatType(initializer->data_type()); + }; + + const auto& node_inputs = node.InputDefs(); + const bool is_input_const_float = is_const_float(node_inputs[0]); + const bool is_weight_const_float = is_const_float(node_inputs[1]); + const bool has_bias = node_inputs.size() > 2 && node_inputs[2]->Exists(); + const bool is_bias_const_float = has_bias && is_const_float(node_inputs[2]); + + if (is_input_const_float) { + return false; + } + + if (!allow_float_weight_and_bias_ && (is_weight_const_float || is_bias_const_float)) { + return false; + } + + // Check that if an input is not a float initializer, it must come from a DQ. + const int expected_num_dqs = (1 + static_cast(!is_weight_const_float) + + static_cast(has_bias && !is_bias_const_float)); + + if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, expected_num_dqs)) { return false; } // input and output types need to be same int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); - int32_t dt_weight = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); if (dt_input != dt_output) { return false; } - if (!allow_4bit_weight_ && Is4BitIntType(dt_weight)) { + if (!allow_16bit_ && Is16BitIntType(dt_input)) { return false; } - if (dt_input == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8) { - if (!int8_allowed_ || dt_weight != dt_input) { + // Check quantized weight type. 
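// Illustrative DQ counts implied by the expected_num_dqs expression above (hypothetical node layouts):
//   DQ -> input, float weight, no bias       => expected_num_dqs == 1
//   DQ -> input, DQ -> weight, float bias    => expected_num_dqs == 2
//   DQ -> input, DQ -> weight, DQ -> bias    => expected_num_dqs == 3
// The weight dtype checks that follow apply only when the weight is itself fed by a DQ node.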
+ if (!is_weight_const_float) { + int32_t dt_weight = dq_nodes[1]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (!allow_4bit_weight_ && Is4BitIntType(dt_weight)) { return false; } - } - if (dq_nodes.size() == 3) { // has bias - int32_t dt_bias = dq_nodes[2]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); - if (dt_bias != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) { + if (dt_input == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8) { + if (!int8_allowed_ || dt_weight != dt_input) { + return false; + } + } + + if (!allow_16bit_ && Is16BitIntType(dt_weight)) { return false; } } - // 16-bit int types must be explicitly allowed. - if (!allow_16bit_ && (Is16BitIntType(dt_input) || Is16BitIntType(dt_weight))) { - return false; + // Check quantized bias (if any) + if (has_bias && !is_bias_const_float) { + int32_t dt_bias = dq_nodes.back()->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + if (dt_bias != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) { + return false; + } } return true; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 0ba5436e69e81..5f937c91bd1f9 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -145,8 +145,12 @@ class SplitNodeGroupSelector : public NodeGroupSelector { class ConvNodeGroupSelector : public NodeGroupSelector { public: // default to 'true' - ConvNodeGroupSelector(bool int8_allowed = true, bool allow_16bit = true, bool allow_4bit_weight = true) - : int8_allowed_(int8_allowed), allow_16bit_(allow_16bit), allow_4bit_weight_(allow_4bit_weight) {} + ConvNodeGroupSelector(bool int8_allowed = true, bool allow_16bit = true, bool allow_4bit_weight = true, + bool allow_float_weight_and_bias = true) + : int8_allowed_(int8_allowed), + allow_16bit_(allow_16bit), + allow_4bit_weight_(allow_4bit_weight), + allow_float_weight_and_bias_(allow_float_weight_and_bias) {} private: bool Check(const GraphViewer& graph_viewer, const Node& node, @@ -156,6 +160,7 @@ class ConvNodeGroupSelector : public NodeGroupSelector { bool int8_allowed_; bool allow_16bit_; bool allow_4bit_weight_; + bool allow_float_weight_and_bias_; // EP will have to quantize the weights if necessary. 
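// Illustrative construction (hypothetical call site, not part of this change): an EP that quantizes float
// weights/bias itself could build ConvNodeGroupSelector(/*int8_allowed*/ true, /*allow_16bit*/ true,
//                                                       /*allow_4bit_weight*/ true,
//                                                       /*allow_float_weight_and_bias*/ true);
// the generic ConvSelector below keeps the prior behavior by passing allow_float_weight_and_bias = false.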
}; class WhereNodeGroupSelector : public NodeGroupSelector { @@ -360,7 +365,8 @@ class ConvSelector : public BaseSelector { public: ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false, gsl::span compatible_providers = {}) - : BaseSelector(std::make_unique(int8_allowed, allow_16bit, allow_4bit_weight), + : BaseSelector(std::make_unique(int8_allowed, allow_16bit, allow_4bit_weight, + /*allow_float_weight_and_bias*/ false), compatible_providers) {} void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index ed70111087e19..0e6da8bb490e7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -80,6 +80,40 @@ Status BaseOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } +Status BaseOpBuilder::GetBiasQuantParams(const QnnQuantParamsWrapper& input0_qparams, + const QnnQuantParamsWrapper& weight_qparams, + /*out*/ std::vector& bias_scales, + /*out*/ std::vector& bias_offsets, + const logging::Logger& logger) const { + ORT_UNUSED_PARAMETER(logger); + // For now, only handle case where input0 is per-tensor quantized and input1 is either per-tensor + // or per-channel quantized. + ORT_RETURN_IF_NOT(input0_qparams.IsPerTensor(/*include_bw*/ true) && weight_qparams.IsQuantized(), + "QNN EP currently only supports computing bias quantization params for per-tensor ", + "input[0] and per-tensor/per-channel input[1]"); + + // Bias's quantization scale(s) should be the product of the other inputs' quantization scales. + // Input[0] is expected to have one scale (per-tensor). + // If input[1] is per-channel (many scales), then the bias also needs to be per-channel. + std::vector input0_quant_scales; + std::vector weight_quant_scales; + ORT_RETURN_IF_ERROR(input0_qparams.GetScales(input0_quant_scales)); + ORT_RETURN_IF_ERROR(weight_qparams.GetScales(weight_quant_scales)); + + const size_t num_bias_scales_offsets = weight_quant_scales.size(); + assert(input0_quant_scales.size() == 1); // Expected for per-tensor. + ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(), + "Input[1] should have >= 1 quantization scale values"); + + bias_offsets = std::vector(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros. 
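// Illustrative values: a per-tensor input0 scale of 0.02 and per-channel weight scales {0.1, 0.05}
// produce bias_scales {0.002, 0.001} with bias_offsets {0, 0}.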
+ bias_scales.resize(num_bias_scales_offsets); + for (size_t i = 0; i < num_bias_scales_offsets; i++) { + bias_scales[i] = input0_quant_scales[0] * weight_quant_scales[i]; + } + + return Status::OK(); +} + Status BaseOpBuilder::AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper, const QnnQuantParamsWrapper& input0_qparams, const QnnQuantParamsWrapper& input1_qparams, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 055c0f6ccf2fa..752a5bc6c20c5 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -95,6 +95,12 @@ class BaseOpBuilder : public IOpBuilder { const logging::Logger& logger, std::vector& input_names) const ORT_MUST_USE_RESULT; + Status GetBiasQuantParams(const QnnQuantParamsWrapper& input0_qparams, + const QnnQuantParamsWrapper& weight_qparams, + /*out*/ std::vector& bias_scales, + /*out*/ std::vector& bias_offsets, + const logging::Logger& logger) const ORT_MUST_USE_RESULT; + Status AddZeroBiasInput(QnnModelWrapper& qnn_model_wrapper, const QnnQuantParamsWrapper& input0_qparams, const QnnQuantParamsWrapper& input1_qparams, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index 12887f0fb72d6..087fca1e4706e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -190,6 +190,8 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, assert(num_inputs >= 2); // Checked by IsOpSupported. + QnnQuantParamsWrapper weight_qparams; + // // Input 0 // @@ -231,6 +233,26 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } + // Quantize float32 weight to int8_t (per-tensor, symmetric) if necessary. + if (!input_info.quant_param.IsQuantized()) { + ORT_RETURN_IF(input_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT, + "QNN EP only supports unquantized float32 weights"); + + Qnn_DataType_t quant_type = QNN_DATATYPE_SFIXED_POINT_8; // int8_t quantization of input[1] works with input[0] of all types. + std::array weight_scales = {0.0f}; + std::array weight_offsets = {0}; + gsl::span flt_weight = ReinterpretAsSpan(unpacked_tensor); + ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(flt_weight, actual_shape, weight_scales, weight_offsets, + quant_type, /*symmetric*/ true, /*axis*/ std::nullopt)); + + std::vector quant_weight(flt_weight.size()); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_weight, actual_shape, weight_scales, weight_offsets, + quant_weight, quant_type)); + unpacked_tensor = std::move(quant_weight); + input_info.qnn_data_type = quant_type; + input_info.quant_param = QnnQuantParamsWrapper(weight_scales[0], weight_offsets[0]); + } + // Transpose quantization parameter's axis if this is using per-channel quantization. if (input_info.quant_param.IsPerChannel()) { std::vector perm; @@ -279,6 +301,7 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, } } + weight_qparams = input_info.quant_param.Copy(); // Store a copy of weight quantization params in case we need to quantize float bias. 
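// Illustrative numbers for the symmetric int8 weight quantization above: weights spanning [-1.0, 1.0]
// use qmin = -127, qmax = 127, giving scale = (1.0 - (-1.0)) / (127 - (-127)) ~= 0.00787 and offset = 0.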
Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(actual_name); QnnTensorWrapper input_tensorwrapper(actual_name, tensor_type, input_info.qnn_data_type, std::move(input_info.quant_param), @@ -289,9 +312,58 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, // // Input 2: bias // - const bool has_bias_input = num_inputs == 3; + const bool has_bias_input = num_inputs == 3 && inputs[2].node_arg.Exists(); if (has_bias_input) { - ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names)); + TensorInfo bias_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], bias_info)); + + if (!bias_info.quant_param.IsQuantized() && bias_info.is_initializer) { + // Quantize float bias with bias_scale = input0_scale * weight_scale, bias_offset = 0. If weight is per-channel, + // then the bias will be quantized per-channel (axis 0) as well. + ORT_RETURN_IF(bias_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT, + "QNN EP only supports unquantized float32 bias"); + + TensorInfo input0_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info)); + + std::vector bias_scales; + std::vector bias_offsets; + ORT_RETURN_IF_ERROR(GetBiasQuantParams(input0_info.quant_param, weight_qparams, + bias_scales, bias_offsets, logger)); + + size_t num_bias_elems = qnn::utils::ShapeSizeCalc(bias_info.shape, 0, bias_info.shape.size()); + std::vector bias_quant_bytes(num_bias_elems * sizeof(int32_t), 0); + + Qnn_DataType_t bias_quant_type = QNN_DATATYPE_SFIXED_POINT_32; + std::vector flt_bias_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*bias_info.initializer_tensor, flt_bias_bytes)); + gsl::span flt_bias = ReinterpretAsSpan(flt_bias_bytes); + assert(flt_bias.size() == num_bias_elems); + + std::optional quant_axis; + if (weight_qparams.IsPerChannel()) { + quant_axis = 0; + } + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_bias, bias_info.shape, bias_scales, bias_offsets, bias_quant_bytes, + bias_quant_type, /*axis*/ quant_axis)); + QnnQuantParamsWrapper bias_qparams; + + if (quant_axis.has_value()) { + bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ static_cast(*quant_axis), + /*is_int4*/ false); + } else { + bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]); + } + + const std::string& bias_name = inputs[2].node_arg.Name(); + auto bias_tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_quant_type, + std::move(bias_qparams), std::move(bias_info.shape), std::move(bias_quant_bytes)); + + qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensor_wrapper)); + input_names.push_back(bias_name); + } else { + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names)); + } } #if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 16 && QNN_API_VERSION_MINOR <= 18) @@ -325,6 +397,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, const size_t num_inputs = inputs.size(); OnnxConvType conv_type = {}; ORT_RETURN_IF_ERROR(GetOnnxConvType(node_unit.OpType(), conv_type)); + QnnQuantParamsWrapper weight_qparams; assert(num_inputs >= 2); // Checked by IsOpSupported. 
@@ -460,6 +533,26 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str()); } + // Quantize float32 weight to int8_t (per-tensor, symmetric) if necessary. + if (!input_info.quant_param.IsQuantized()) { + ORT_RETURN_IF(input_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT, + "QNN EP only supports unquantized float32 weights"); + + Qnn_DataType_t quant_type = QNN_DATATYPE_SFIXED_POINT_8; // int8_t quantization of input[1] works with input[0] of all types. + std::array weight_scales = {0.0f}; + std::array weight_offsets = {0}; + gsl::span flt_weight = ReinterpretAsSpan(unpacked_tensor); + ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(flt_weight, final_shape, weight_scales, weight_offsets, + quant_type, /*symmetric*/ true, /*axis*/ std::nullopt)); + + std::vector quant_weight(flt_weight.size()); + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_weight, final_shape, weight_scales, weight_offsets, + quant_weight, quant_type)); + unpacked_tensor = std::move(quant_weight); + input_info.qnn_data_type = quant_type; + input_info.quant_param = QnnQuantParamsWrapper(weight_scales[0], weight_offsets[0]); + } + // Transpose quantization parameter's axis if this is using per-channel quantization. if (input_info.quant_param.IsPerChannel()) { const std::vector& perm = conv_type == OnnxConvType::kConv ? nchw2hwcn_perm : cnhw2hwcn_perm; @@ -507,6 +600,7 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, } } + weight_qparams = input_info.quant_param.Copy(); // Store a copy of weight quantization params in case we need to quantize float bias. Qnn_TensorType_t tensor_type = qnn_model_wrapper.GetTensorType(conv_weight_input_name); QnnTensorWrapper input_tensorwrapper(conv_weight_input_name, tensor_type, input_info.qnn_data_type, std::move(input_info.quant_param), std::move(final_shape), @@ -518,7 +612,56 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper, // Input 2: bias // if (num_inputs == 3) { - ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names)); + TensorInfo bias_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[2], bias_info)); + + if (!bias_info.quant_param.IsQuantized() && bias_info.is_initializer) { + // Quantize float bias with bias_scale = input0_scale * weight_scale, bias_offset = 0. If weight is per-channel, + // then the bias will be quantized per-channel (axis 0) as well. 
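// Illustrative values: a float bias element of 3.0 with input0 scale 0.02 and weight scale 0.1 uses
// bias_scale = 0.002, so the stored int32 value is nearbyint(3.0 / 0.002) - 0 = 1500.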
+ ORT_RETURN_IF(bias_info.initializer_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT, + "QNN EP only supports unquantized float32 bias"); + + TensorInfo input0_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[0], input0_info)); + + std::vector bias_scales; + std::vector bias_offsets; + ORT_RETURN_IF_ERROR(GetBiasQuantParams(input0_info.quant_param, weight_qparams, + bias_scales, bias_offsets, logger)); + + size_t num_bias_elems = qnn::utils::ShapeSizeCalc(bias_info.shape, 0, bias_info.shape.size()); + std::vector bias_quant_bytes(num_bias_elems * sizeof(int32_t), 0); + + Qnn_DataType_t bias_quant_type = QNN_DATATYPE_SFIXED_POINT_32; + std::vector flt_bias_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*bias_info.initializer_tensor, flt_bias_bytes)); + gsl::span flt_bias = ReinterpretAsSpan(flt_bias_bytes); + assert(flt_bias.size() == num_bias_elems); + + std::optional quant_axis; + if (weight_qparams.IsPerChannel()) { + quant_axis = 0; + } + ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(flt_bias, bias_info.shape, bias_scales, bias_offsets, bias_quant_bytes, + bias_quant_type, /*axis*/ quant_axis)); + QnnQuantParamsWrapper bias_qparams; + + if (quant_axis.has_value()) { + bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ static_cast(*quant_axis), + /*is_int4*/ false); + } else { + bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]); + } + + const std::string& bias_name = inputs[2].node_arg.Name(); + auto bias_tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, bias_quant_type, + std::move(bias_qparams), std::move(bias_info.shape), std::move(bias_quant_bytes)); + + qnn_model_wrapper.AddTensorWrapper(std::move(bias_tensor_wrapper)); + input_names.push_back(bias_name); + } else { + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[2], logger, input_names)); + } } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc index 813bba8a5952b..307c400892f1d 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.cc @@ -54,37 +54,28 @@ static bool GetQScalarScaleZeroPoint(const QnnModelWrapper& qnn_model_wrapper, } // Computes the floating point range (rmin, rmax) from a QuantizeLinear node's scale/zero-point. 
-static bool GetQRminRmax(const QnnModelWrapper& qnn_model_wrapper, - const NodeUnit& q_node_unit, - /*out*/ float& rmin, - /*out*/ float& rmax) { - int32_t zp_data_type = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED; - int32_t zero_point = 0; - float scale = 0.0f; - - if (!GetQScalarScaleZeroPoint(qnn_model_wrapper, q_node_unit, scale, zero_point, zp_data_type)) { - return false; - } - +static bool GetQminQmax(int32_t zp_data_type, + /*out*/ int32_t& qmin, + /*out*/ int32_t& qmax) { switch (zp_data_type) { case ONNX_NAMESPACE::TensorProto_DataType_INT8: { - rmin = scale * (std::numeric_limits::lowest() - zero_point); - rmax = scale * (std::numeric_limits::max() - zero_point); + qmin = std::numeric_limits::lowest(); + qmax = std::numeric_limits::max(); break; } case ONNX_NAMESPACE::TensorProto_DataType_UINT8: { - rmin = scale * (std::numeric_limits::lowest() - zero_point); - rmax = scale * (std::numeric_limits::max() - zero_point); + qmin = std::numeric_limits::lowest(); + qmax = std::numeric_limits::max(); break; } case ONNX_NAMESPACE::TensorProto_DataType_INT16: { - rmin = scale * (std::numeric_limits::lowest() - zero_point); - rmax = scale * (std::numeric_limits::max() - zero_point); + qmin = std::numeric_limits::lowest(); + qmax = std::numeric_limits::max(); break; } case ONNX_NAMESPACE::TensorProto_DataType_UINT16: { - rmin = scale * (std::numeric_limits::lowest() - zero_point); - rmax = scale * (std::numeric_limits::max() - zero_point); + qmin = std::numeric_limits::lowest(); + qmax = std::numeric_limits::max(); break; } default: @@ -100,10 +91,11 @@ static bool CanClipBeRemoved(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& q_node_unit, const logging::Logger& logger) { assert(clip_node_unit.OpType() == "Clip" && q_node_unit.OpType() == QUANTIZE_LINEAR); - float rmin = 0.0f; - float rmax = 0.0f; + int32_t zp_data_type = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED; + int32_t zero_point = 0; + float scale = 0.0f; - if (!GetQRminRmax(qnn_model_wrapper, q_node_unit, rmin, rmax)) { + if (!GetQScalarScaleZeroPoint(qnn_model_wrapper, q_node_unit, scale, zero_point, zp_data_type)) { return false; } @@ -115,15 +107,19 @@ static bool CanClipBeRemoved(const QnnModelWrapper& qnn_model_wrapper, return false; } - // The clip range must entirely overlap the quantization range (quantization can be smaller). - // Clip range: [------------------] - // Quant range: [-------------] - constexpr float epsilon = std::numeric_limits::epsilon(); - if ((epsilon < clip_min - rmin) || (epsilon < rmax - clip_max)) { + int32_t q_clip_min = static_cast(::rint(clip_min / scale)) + zero_point; + int32_t q_clip_max = static_cast(::rint(clip_max / scale)) + zero_point; + + int32_t data_type_min = 0; + int32_t data_type_max = 0; + if (!GetQminQmax(zp_data_type, data_type_min, data_type_max)) { return false; } - return true; + // The clip range must entirely overlap the quantization range (quantization can be smaller). + // Clip range: [------------------] + // Quant range: [-------------] + return q_clip_min <= data_type_min && q_clip_max >= data_type_max; } // Returns true if the Relu in the sequence (Relu -> Q) can be removed because it is made redundant by the Q. 
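// Illustrative numbers for the Clip check above: with a uint8 Q node using scale = 0.05 and zero_point = 0,
// Clip(min = 0.0f, max = 12.75f) maps to q_clip = [0, 255], which covers the full uint8 range [0, 255],
// so the Clip is redundant and can be removed; Clip(max = 6.0f) maps to q_clip_max = 120 < 255 and is kept.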
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 8d2cb5bdb6da0..3a393fe192ee6 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -497,31 +497,6 @@ std::pair CheckMinMax(float rmin, float rmax) { return std::make_pair(rmin, rmax); } -template -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, - T& qmin, - T& qmax) { - if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { - qmin = static_cast(std::numeric_limits::min()); - qmax = static_cast(std::numeric_limits::max()); - } else { - ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); - } - return Status::OK(); -} - Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, @@ -535,20 +510,22 @@ Status GetQuantParams(float rmin, rmin = -abs_max; } - float qmin = 0.0f; - float qmax = 255.0f; - ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); + double rmin_dbl = static_cast(rmin); + double rmax_dbl = static_cast(rmax); + double qmin = 0.0; + double qmax = 0.0; + ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric)); - scale = (rmax - rmin) / (qmax - qmin); - float initial_zero_point = 0.0f; + double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin); + double initial_zero_point = 0.0; if (symmetric) { - initial_zero_point = std::round(rmin + rmax) / 2; + initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2; } else { - initial_zero_point = qmin - (rmin / scale); + initial_zero_point = qmin - (rmin_dbl / scale_dbl); } - zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); - // To match QNN quantization definition - zero_point = 0 - zero_point; + zero_point = static_cast(RoundHalfToEven(static_cast(Saturate(qmax, qmin, initial_zero_point)))); + zero_point = -zero_point; // Negate to match QNN quantization definition. + scale = static_cast(scale_dbl); return Status::OK(); } @@ -570,6 +547,131 @@ Status Quantize(const double double_value, return Status::OK(); } +size_t ShapeSizeCalc(gsl::span shape, size_t start, size_t end) { + size_t size = 1; + for (size_t i = start; i < end; i++) { + size *= shape[i]; + } + return size; +} + +Status GetDataQuantParams(gsl::span data, gsl::span shape, + /*out*/ gsl::span scales, /*out*/ gsl::span offsets, + Qnn_DataType_t data_type, bool symmetric, std::optional axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? 
static_cast(*axis) + num_dims : static_cast(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + float rmin = std::numeric_limits::max(); + float rmax = std::numeric_limits::lowest(); + for (size_t j = 0; j < block_size; j++) { + rmin = std::min(rmin, data[i]); + rmax = std::max(rmax, data[i]); + i++; + } + + scales[bd] = 1.0f; + offsets[bd] = 0; + ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric)); + } + } + + assert(i == data.size()); + return Status::OK(); +} + +Status QuantizeData(gsl::span data, gsl::span shape, + gsl::span scales, gsl::span offsets, + /*out*/ gsl::span quant_bytes, Qnn_DataType_t data_type, + std::optional axis) { + const size_t num_dims = shape.size(); + const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims); + ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize"); + size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size(); + ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes, + "Cannot quantize data because output buffer is not the correct size"); + + size_t block_count = 1; + size_t broadcast_dim = 1; + size_t block_size = num_elems; + + if (axis.has_value()) { + size_t axis_no_neg = *axis < 0 ? static_cast(*axis) + num_dims : static_cast(*axis); + block_count = ShapeSizeCalc(shape, 0, axis_no_neg); + broadcast_dim = shape[axis_no_neg]; + block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims); + } + + ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer"); + ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer"); + + size_t i = 0; + for (size_t n = 0; n < block_count; n++) { + for (size_t bd = 0; bd < broadcast_dim; bd++) { + switch (data_type) { + case QNN_DATATYPE_SFIXED_POINT_8: { + int8_t* output = reinterpret_cast(quant_bytes.data()); + ParQuantizeLinearStd(&data[i], &output[i], block_size, scales[bd], static_cast(-offsets[bd]), nullptr); + break; + } + case QNN_DATATYPE_UFIXED_POINT_8: { + uint8_t* output = reinterpret_cast(quant_bytes.data()); + ParQuantizeLinearStd(&data[i], &output[i], block_size, scales[bd], static_cast(-offsets[bd]), nullptr); + break; + } + case QNN_DATATYPE_SFIXED_POINT_16: { + int16_t* output = reinterpret_cast(quant_bytes.data()); + ParQuantizeLinearStd(&data[i], &output[i], block_size, scales[bd], static_cast(-offsets[bd]), nullptr); + break; + } + case QNN_DATATYPE_UFIXED_POINT_16: { + uint16_t* output = reinterpret_cast(quant_bytes.data()); + ParQuantizeLinearStd(&data[i], &output[i], block_size, scales[bd], static_cast(-offsets[bd]), nullptr); + break; + } + case QNN_DATATYPE_SFIXED_POINT_32: { + const double clip_min = static_cast(std::numeric_limits::min()); + const double clip_max = static_cast(std::numeric_limits::max()); + + int32_t* output = reinterpret_cast(quant_bytes.data()); + for (size_t e = 0; e < block_size; ++e) { + const double scale = static_cast(scales[bd]); + const double offset = static_cast(offsets[bd]); + double float_val = std::nearbyint(static_cast(data[i + e]) / scale) - offset; + float_val = 
std::max(float_val, clip_min); + float_val = std::min(float_val, clip_max); + output[i + e] = static_cast(float_val); + } + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData"); + } + i += block_size; + } + } + assert(i == data.size()); + + return Status::OK(); +} + } // namespace utils } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index aa4a27460563f..4307749e04635 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -74,7 +74,30 @@ static bool ArrayHasString(const std::array& strings, std:: std::pair CheckMinMax(float rmin, float rmax); template -Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax); +Status GetQminQmax(const Qnn_DataType_t qnn_data_type, + T& qmin, + T& qmax, + bool symmetric = false) { + if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { + qmin = static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) { + qmin = static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast(std::numeric_limits::min() + static_cast(symmetric)); + qmax = static_cast(std::numeric_limits::max()); + } else { + ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); + } + return Status::OK(); +} template inline T Saturate(const T qmax, @@ -104,6 +127,18 @@ Status Quantize(const double double_value, const Qnn_DataType_t qnn_data_type, int& quant_value); +size_t ShapeSizeCalc(gsl::span shape, size_t start, size_t end); + +Status GetDataQuantParams(gsl::span data, gsl::span shape, + /*out*/ gsl::span scales, /*out*/ gsl::span offsets, + Qnn_DataType_t data_type, bool symmetric = false, + std::optional axis = std::nullopt); + +Status QuantizeData(gsl::span data, gsl::span shape, + gsl::span scales, gsl::span offsets, + /*out*/ gsl::span quant_bytes, Qnn_DataType_t data_type, + std::optional axis = std::nullopt); + } // namespace utils } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index cf37fc00335d3..0130f4ceb7ef1 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -115,11 +115,14 @@ static GetTestQDQModelFn BuildQDQConvTestCase( std::optional group, const std::string& auto_pad = "NOTSET", bool use_contrib_qdq = false, - std::optional output_activation = std::nullopt) { + std::optional output_activation = std::nullopt, + bool quantize_weights = true, + bool quantize_bias = true) { return [conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad, - use_contrib_qdq, output_activation](ModelTestBuilder& builder, - std::vector>& output_qparams) { + use_contrib_qdq, output_activation, + quantize_weights, quantize_bias](ModelTestBuilder& 
builder, + std::vector>& output_qparams) { std::vector conv_inputs; // input -> Q/DQ -> @@ -129,19 +132,28 @@ static GetTestQDQModelFn BuildQDQConvTestCase( use_contrib_qdq); conv_inputs.push_back(input_qdq); - // weights -> Q/DQ -> auto* weights = MakeTestInput(builder, weights_def); QuantParams weights_qparams = GetTestInputQuantParams(weights_def); - auto* weights_qdq = AddQDQNodePair(builder, weights, weights_qparams.scale, - weights_qparams.zero_point, use_contrib_qdq); - conv_inputs.push_back(weights_qdq); + if (quantize_weights) { + // weights -> Q/DQ -> + auto* weights_qdq = AddQDQNodePair(builder, weights, weights_qparams.scale, + weights_qparams.zero_point, use_contrib_qdq); + conv_inputs.push_back(weights_qdq); + } else { + // Leave weights as float + conv_inputs.push_back(weights); + } // bias -> if (!bias_def.GetShape().empty()) { - // Bias requirement taken from python quantization tool: onnx_quantizer.py::quantize_bias_static() - const float bias_scale = input_qparams.scale * weights_qparams.scale; - - conv_inputs.push_back(MakeTestQDQBiasInput(builder, bias_def, bias_scale, use_contrib_qdq)); + if (quantize_bias) { + // Bias requirement taken from python quantization tool: onnx_quantizer.py::quantize_bias_static() + const float bias_scale = input_qparams.scale * weights_qparams.scale; + conv_inputs.push_back(MakeTestQDQBiasInput(builder, bias_def, bias_scale, use_contrib_qdq)); + } else { + // Leave bias as float + conv_inputs.push_back(MakeTestInput(builder, bias_def)); + } } auto* conv_output = builder.MakeIntermediate(); @@ -191,11 +203,13 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase( std::optional group, const std::string& auto_pad = "NOTSET", bool use_contrib_qdq = false, - std::optional output_activation = std::nullopt) { + std::optional output_activation = std::nullopt, + bool quantize_bias = true) { return [conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad, use_contrib_qdq, - weight_quant_axis, output_activation](ModelTestBuilder& builder, - std::vector>& output_qparams) { + weight_quant_axis, output_activation, + quantize_bias](ModelTestBuilder& builder, + std::vector>& output_qparams) { std::vector conv_inputs; // input -> Q/DQ -> @@ -236,28 +250,32 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase( // Quantized(bias) -> DQ -> if (!bias_def.GetShape().empty()) { - // Bias requirement taken from python quantization tool: onnx_quantizer.py::quantize_bias_static() - // bias_scale = input_scale * weight_scale - // bias_zero_point = 0 - ORT_ENFORCE(bias_def.IsInitializer() && bias_def.IsRawData()); - std::vector bias_scales = weight_scales; - std::vector bias_zero_points(weight_scales.size(), 0); - for (size_t i = 0; i < bias_scales.size(); i++) { - bias_scales[i] *= input_qparams.scale; + if (quantize_bias) { + // Bias requirement taken from python quantization tool: onnx_quantizer.py::quantize_bias_static() + // bias_scale = input_scale * weight_scale + // bias_zero_point = 0 + ORT_ENFORCE(bias_def.IsInitializer() && bias_def.IsRawData()); + std::vector bias_scales = weight_scales; + std::vector bias_zero_points(weight_scales.size(), 0); + for (size_t i = 0; i < bias_scales.size(); i++) { + bias_scales[i] *= input_qparams.scale; + } + + TensorShape bias_shape = bias_def.GetTensorShape(); + std::vector quantized_biases(bias_shape.Size()); + QuantizeValues(bias_def.GetRawData(), quantized_biases, bias_shape, bias_scales, + bias_zero_points, 0); + + NodeArg* bias_initializer = 
builder.MakeInitializer(bias_def.GetShape(), quantized_biases); + NodeArg* bias_dq = builder.MakeIntermediate(); + Node& bias_dq_node = builder.AddDequantizeLinearNode(bias_initializer, bias_scales, bias_zero_points, + bias_dq, nullptr, use_contrib_qdq); + + bias_dq_node.AddAttribute("axis", static_cast(0)); + conv_inputs.push_back(bias_dq); + } else { + conv_inputs.push_back(MakeTestInput(builder, bias_def)); } - - TensorShape bias_shape = bias_def.GetTensorShape(); - std::vector quantized_biases(bias_shape.Size()); - QuantizeValues(bias_def.GetRawData(), quantized_biases, bias_shape, bias_scales, - bias_zero_points, 0); - - NodeArg* bias_initializer = builder.MakeInitializer(bias_def.GetShape(), quantized_biases); - NodeArg* bias_dq = builder.MakeIntermediate(); - Node& bias_dq_node = builder.AddDequantizeLinearNode(bias_initializer, bias_scales, bias_zero_points, - bias_dq, nullptr, use_contrib_qdq); - - bias_dq_node.AddAttribute("axis", static_cast(0)); - conv_inputs.push_back(bias_dq); } auto* conv_output = builder.MakeIntermediate(); @@ -309,7 +327,9 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef bool use_contrib_qdq = false, int opset = 13, QDQTolerance tolerance = QDQTolerance(), - std::optional output_activation = std::nullopt) { + std::optional output_activation = std::nullopt, + bool quantize_weights = true, + bool quantize_bias = true) { ProviderOptions provider_options; #if defined(_WIN32) @@ -323,7 +343,8 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef BuildQDQConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad, use_contrib_qdq, - output_activation), + output_activation, quantize_weights, + quantize_bias), provider_options, opset, expected_ep_assignment, @@ -346,7 +367,8 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te bool use_contrib_qdq = false, int opset = 13, QDQTolerance tolerance = QDQTolerance(), - std::optional output_activation = std::nullopt) { + std::optional output_activation = std::nullopt, + bool quantize_bias = true) { ProviderOptions provider_options; #if defined(_WIN32) @@ -360,7 +382,8 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te auto qdq_fn = BuildQDQPerChannelConvTestCase(conv_op_type, input_def, weights_def, bias_def, weight_quant_axis, strides, pads, dilations, group, auto_pad, - use_contrib_qdq, output_activation); + use_contrib_qdq, output_activation, + quantize_bias); TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance); } @@ -752,6 +775,138 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_dynamic_input) { QDQTolerance(0.00413f)); } +TEST_F(QnnHTPBackendTests, Conv2D_U8Input0_FloatWeightBias) { + std::vector input_data = GetFloatDataInRange(0.0f, 10.0f, 25); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 9); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ false, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv1D_U8Input0_FloatWeightBias) { + std::vector input_data 
= GetFloatDataInRange(0.0f, 10.0f, 5); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 3); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1}, // Strides + {0, 0}, // Pads + {1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ false, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv2D_U16Input0_FloatWeightBias) { + std::vector input_data = GetFloatDataInRange(0.0f, 10.0f, 25); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 9); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ false, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv1D_U16Input0_FloatWeightBias) { + std::vector input_data = GetFloatDataInRange(0.0f, 10.0f, 5); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 3); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1}, // Strides + {0, 0}, // Pads + {1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ false, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv2D_U8Input0_S8Weight_FloatBias) { + std::vector input_data = GetFloatDataInRange(0.0f, 10.0f, 25); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 9); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ true, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv1D_U8Input0_S8Weight_FloatBias) { + std::vector input_data = GetFloatDataInRange(0.0f, 10.0f, 5); + std::vector weight_data = GetFloatDataInRange(-1.0f, 1.0f, 3); + std::vector bias_data = {2.0f}; + RunHTPConvOpTest("Conv", + TestInputDef({1, 1, 5}, false, input_data), // dynamic input + TestInputDef({1, 1, 3}, true, weight_data), // static input + TestInputDef({1}, true, bias_data), // bias + {1}, // Strides + {0, 0}, // Pads + {1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_weights*/ true, + /*quantize_bias*/ false); +} + // Test per-channel QDQ Conv. 
in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8 TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { std::vector input_shape = {1, 2, 4, 4}; @@ -780,6 +935,66 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) { 13); // opset } +TEST_F(QnnHTPBackendTests, Conv2D_U8Input0_S8PerChannelWeight_FloatBias) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 2, 2}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + 0, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 19, // opset + QDQTolerance(), + std::nullopt, + /*quantize_bias*/ false); +} + +TEST_F(QnnHTPBackendTests, Conv2D_U16Input0_S8PerChannelWeight_FloatBias) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 2, 2}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-10.0f, 10.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + 0, // weight quant axis + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21, // opset + QDQTolerance(), + std::nullopt, + /*quantize_bias*/ false); +} + // Test per-channel QDQ Conv with INT4 weights. in0: u16, in1 (weight): s4, in2 (bias): s32, out: u8 TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) { std::vector input_shape = {1, 2, 4, 4};