@@ -387,6 +387,61 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
    return false;
  }

+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  // Check if a NodeArg tensor is 16-bit quantized (UINT16 or INT16)
+  auto is_16bit_tensor = [](const onnxruntime::NodeArg* node_arg) -> bool {
+    if (!node_arg) return false;
+    const auto* type_proto = node_arg->TypeAsProto();
+    if (type_proto && type_proto->has_tensor_type()) {
+      auto elem_type = type_proto->tensor_type().elem_type();
+      return (elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+              elem_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+    }
+    return false;
+  };
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // The quantized input tensor (index 0) determines the quantization type
+        if (is_16bit_tensor(input_defs.empty() ? nullptr : input_defs[0])) {
+          return true;
+        }
+
+        // Zero point (index 2) must match quantized tensor type per ONNX spec
+        // It's optional - absent for INT32 and some float8 types
+        if (input_defs.size() >= 3 && is_16bit_tensor(input_defs[2])) {
+          return true;
+        }
+      }
+      else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        // The quantized output tensor determines the quantization type
+        const auto& output_defs = node->OutputDefs();
+        if (is_16bit_tensor(output_defs.empty() ? nullptr : output_defs[0])) {
+          return true;
+        }
+
+        // Zero point (index 2) must match quantized tensor type per ONNX spec
+        // It's optional - absent for INT32 and some float8 types
+        if (input_defs.size() >= 3 && is_16bit_tensor(input_defs[2])) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                [[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +500,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
  }
#endif

+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
  const auto& onnx_model_path_name = subgraph.ModelPath();
  // QDQ stripping enabled only for the NPU and experimentally on the GPU
  if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +517,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
    ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
    return model_proto;
  } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
    // Create a copy of the model
    std::unique_ptr<onnxruntime::Model> model;
    Status status = qdq_scales_fix::Transform(subgraph, logger, model);
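
For context, here is a minimal sketch of the kind of graph the new helper is meant to flag: a DequantizeLinear whose quantized input is declared as UINT16. This is illustrative only and not part of the change above; it builds the graph with the generated ONNX protobuf API, and the include path, function name, and tensor names are assumptions made for the example.

#include <onnx/onnx_pb.h>  // assumed include path for the generated ONNX protos

// Hypothetical helper: construct a graph containing one uint16 DequantizeLinear.
// A GraphViewer over a graph like this should be flagged by
// IsQDQGraphWithUint16OrInt16, because the quantized input (index 0) is typed
// as UINT16. The scale and zero-point inputs are left undeclared for brevity.
ONNX_NAMESPACE::GraphProto MakeUint16DequantGraph() {
  ONNX_NAMESPACE::GraphProto graph;
  graph.set_name("uint16_dq_example");

  // Declare the quantized input as a UINT16 tensor of shape [4].
  auto* x = graph.add_input();
  x->set_name("x_quant");
  auto* tensor_type = x->mutable_type()->mutable_tensor_type();
  tensor_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT16);
  tensor_type->mutable_shape()->add_dim()->set_dim_value(4);

  // DequantizeLinear(x_quant, x_scale, x_zero_point) -> x_float
  auto* node = graph.add_node();
  node->set_op_type("DequantizeLinear");
  node->add_input("x_quant");
  node->add_input("x_scale");
  node->add_input("x_zero_point");
  node->add_output("x_float");
  return graph;
}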