diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc
index cadeab4cbd4cc..2af414bd359bf 100644
--- a/onnxruntime/core/providers/openvino/backend_manager.cc
+++ b/onnxruntime/core/providers/openvino/backend_manager.cc
@@ -387,6 +387,44 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
   return false;
 }
 
+static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
+  const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
+  return type_proto && type_proto->has_tensor_type() &&
+         (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+          type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+}
+
+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // Check quantized input tensor and optional zero point
+        if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      } else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        const auto& output_defs = node->OutputDefs();
+        if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
+            (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                 [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                 [[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +483,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
   }
 #endif
 
+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
   if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +500,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
   } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
     // Create a copy of the model
     std::unique_ptr<onnxruntime::Model> model;
     Status status = qdq_scales_fix::Transform(subgraph, logger, model);
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
index f991e85ebe518..3b25d67b6b376 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
@@ -557,7 +557,9 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) {
     auto dtype = type_proto->tensor_type().elem_type();
 
     // Enable bfloat16 -> float16 on-the-fly conversion
-    if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16)
+    if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 ||
+        dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 ||
+        dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)
       return true;
     if (is_initializer) {
       for (auto const& var : supported_types_initializer_) {
@@ -610,9 +612,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) {
             (var.second == dtype)) {
           return true;
         }
-        // experimentally for GPU and qdq stripping mode allow int16 types
-        if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16))
-          return true;
       }
 #ifndef NDEBUG
       if (openvino_ep::backend_utils::IsDebugEnabled()) {
diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp
index f1ce230387565..3a39152b5d17d 100644
--- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp
+++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp
@@ -3,6 +3,7 @@
 
 #include "qdq_scales_fix.h"
 #include "core/providers/openvino/ov_protobuf_utils.h"
+#include "core/framework/ort_value.h"
 #include "core/framework/float16.h"
 
 #include <fstream>
@@ -904,22 +905,11 @@ Status copy_model(const GraphViewer& src_graph_viewer,
   }
 
   for (auto& [name, tensor_proto] : src_graph.GetAllInitializedTensors()) {
-    dst_graph.AddInitializedTensor(*tensor_proto);
-  }
-
-  for (auto node_arg : src_graph.GetInputsIncludingInitializers()) {
-    auto check_inputs = [node_arg](auto input_node_arg) {
-      return input_node_arg->Name() == node_arg->Name();
-    };
-    if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end())
-      continue;
-
-    auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true);
-    if (src_tensor_proto) {
-      auto dst_tensor_proto = onnx::TensorProto::Create();
-      dst_tensor_proto->copy_from(src_tensor_proto);
-      dst_graph.AddInitializedTensor(*dst_tensor_proto);
-    }
+    auto ort_value = OrtValue();
+    if (src_graph.GetOrtValueInitializer(name, ort_value))
+      ORT_RETURN_IF_ERROR(dst_graph.AddInitializedOrtValue(*tensor_proto, ort_value));
+    else
+      dst_graph.AddInitializedTensor(*tensor_proto);
   }
 
   ORT_RETURN_IF_ERROR(dst_graph.Resolve());
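
Note (illustrative, not part of the diff): IsQDQGraphWithUint16OrInt16 inspects different defs per op because of where ONNX exposes the quantized element type. DequantizeLinear carries it on input 0 and the optional zero point (input 2), while QuantizeLinear's input 0 is float, so the type is only visible on output 0 and the optional zero point. The standalone C++17 sketch below makes that rule concrete; MockNode and ElemType are simplified stand-ins, not ONNX Runtime classes, and it compiles and runs on its own.

// Standalone mock of the detection rule above (plain C++17; the structs here
// are simplified stand-ins for ONNX Runtime's Node/NodeArg types).
#include <cassert>
#include <optional>
#include <string>
#include <vector>

enum class ElemType { Float, Int8, UInt8, Int16, UInt16 };

struct MockNode {
  std::string op_type;
  std::vector<std::optional<ElemType>> inputs;   // element types of input defs
  std::vector<std::optional<ElemType>> outputs;  // element types of output defs
};

static bool Is16Bit(const std::vector<std::optional<ElemType>>& defs, size_t i) {
  return i < defs.size() && defs[i] &&
         (*defs[i] == ElemType::Int16 || *defs[i] == ElemType::UInt16);
}

static bool Uses16BitQuantization(const MockNode& n) {
  if (n.op_type == "DequantizeLinear")
    return Is16Bit(n.inputs, 0) || Is16Bit(n.inputs, 2);   // quantized input / zero point
  if (n.op_type == "QuantizeLinear")
    return Is16Bit(n.outputs, 0) || Is16Bit(n.inputs, 2);  // quantized output / zero point
  return false;
}

int main() {
  // Q node: float in, uint16 out -> detected via output 0.
  MockNode q{"QuantizeLinear",
             {ElemType::Float, ElemType::Float, ElemType::UInt16},
             {ElemType::UInt16}};
  // DQ node: uint8 in -> not a 16-bit QDQ pattern.
  MockNode dq8{"DequantizeLinear",
               {ElemType::UInt8, ElemType::Float, ElemType::UInt8},
               {ElemType::Float}};
  assert(Uses16BitQuantization(q));
  assert(!Uses16BitQuantization(dq8));
  return 0;
}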
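A second note on the copy_model change: as the diff suggests, an initializer's data may be backed by an OrtValue rather than embedded in the TensorProto itself (e.g. in-memory external data), so copying only the proto into the destination graph would reference data the new graph does not own. The self-contained C++17 mock below illustrates that pattern; MockGraph, MockTensorProto, and MockOrtValue are illustrative stand-ins for ORT's types, not the actual API.

// Standalone illustration of why copy_model now copies OrtValue-backed
// initializers together with their payload: otherwise the copied proto would
// dangle once the source graph releases the data.
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

// A "proto" that may reference data held outside itself.
struct MockTensorProto {
  std::string name;
  std::vector<float> inline_data;  // empty if data lives in an external buffer
};

using MockOrtValue = std::shared_ptr<std::vector<float>>;  // ref-counted payload

struct MockGraph {
  std::map<std::string, MockTensorProto> protos;
  std::map<std::string, MockOrtValue> ort_values;  // data not embedded in protos

  bool GetOrtValueInitializer(const std::string& name, MockOrtValue& out) const {
    auto it = ort_values.find(name);
    if (it == ort_values.end()) return false;
    out = it->second;
    return true;
  }
};

// The pattern from the diff: prefer copying proto + OrtValue together.
void CopyInitializers(const MockGraph& src, MockGraph& dst) {
  for (const auto& [name, proto] : src.protos) {
    MockOrtValue v;
    if (src.GetOrtValueInitializer(name, v)) {
      dst.protos[name] = proto;
      dst.ort_values[name] = v;  // shares ownership of the payload
    } else {
      dst.protos[name] = proto;  // inline data: a plain proto copy suffices
    }
  }
}

int main() {
  MockGraph src;
  src.protos["w"] = {"w", {}};  // data lives outside the proto
  src.ort_values["w"] = std::make_shared<std::vector<float>>(std::vector<float>{1.f, 2.f});
  MockGraph dst;
  CopyInitializers(src, dst);
  src.ort_values.clear();                       // source drops its reference...
  assert(dst.ort_values.at("w")->size() == 2);  // ...the copy stays valid
  return 0;
}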