@@ -387,6 +387,61 @@ static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
    return false;
  }

+// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
+static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
+  std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
+  const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
+
+  // Check if a NodeArg tensor is 16-bit quantized (UINT16 or INT16)
+  auto is_16bit_tensor = [](const onnxruntime::NodeArg* node_arg) -> bool {
+    if (!node_arg) return false;
+    const auto* type_proto = node_arg->TypeAsProto();
+    if (type_proto && type_proto->has_tensor_type()) {
+      auto elem_type = type_proto->tensor_type().elem_type();
+      return (elem_type == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
+              elem_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
+    }
+    return false;
+  };
+
+  for (size_t i = 0; i < node_indices.size(); i++) {
+    gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
+
+    if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
+      const auto& input_defs = node->InputDefs();
+
+      if (node->OpType() == "DequantizeLinear") {
+        // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
+        // The quantized input tensor (index 0) determines the quantization type
+        if (is_16bit_tensor(input_defs.empty() ? nullptr : input_defs[0])) {
+          return true;
+        }
+
+        // Zero point (index 2) must match quantized tensor type per ONNX spec
+        // It's optional - absent for INT32 and some float8 types
+        if (input_defs.size() >= 3 && is_16bit_tensor(input_defs[2])) {
+          return true;
+        }
+      }
+      else if (node->OpType() == "QuantizeLinear") {
+        // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
+        // The quantized output tensor determines the quantization type
+        const auto& output_defs = node->OutputDefs();
+        if (is_16bit_tensor(output_defs.empty() ? nullptr : output_defs[0])) {
+          return true;
+        }
+
+        // Zero point (index 2) must match quantized tensor type per ONNX spec
+        // It's optional - absent for INT32 and some float8 types
+        if (input_defs.size() >= 3 && is_16bit_tensor(input_defs[2])) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
                                [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
                                [[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -445,6 +500,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
  }
#endif

+  // Check if the graph is QDQ and has int16 or uint16 quantization
+  // If so, we will apply the QDQ scales fix transformation (for GPU device only)
+  bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);
+
  const auto& onnx_model_path_name = subgraph.ModelPath();
  // QDQ stripping enabled only for the NPU and experimentally on the GPU
  if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -458,7 +517,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
    ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
    return model_proto;
  } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
-             enable_ovep_qdq_optimizer) {
+             is_qdq_graph_uint16_or_int16) {
    // Create a copy of the model
    std::unique_ptr<onnxruntime::Model> model;
    Status status = qdq_scales_fix::Transform(subgraph, logger, model);
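
For context, here is a minimal sketch of the kind of graph the new helper is meant to flag: a DequantizeLinear whose quantized input is declared as UINT16. This is illustrative only and not part of the change above; it builds the graph with the generated ONNX protobuf API, and the include path, function name, and tensor names are assumptions made for the example.

#include <onnx/onnx_pb.h>  // assumed include path for the generated ONNX protos

// Hypothetical helper: construct a graph containing one uint16 DequantizeLinear.
// A GraphViewer over a graph like this should be flagged by
// IsQDQGraphWithUint16OrInt16, because the quantized input (index 0) is typed
// as UINT16. The scale and zero-point inputs are left undeclared for brevity.
ONNX_NAMESPACE::GraphProto MakeUint16DequantGraph() {
  ONNX_NAMESPACE::GraphProto graph;
  graph.set_name("uint16_dq_example");

  // Declare the quantized input as a UINT16 tensor of shape [4].
  auto* x = graph.add_input();
  x->set_name("x_quant");
  auto* tensor_type = x->mutable_type()->mutable_tensor_type();
  tensor_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT16);
  tensor_type->mutable_shape()->add_dim()->set_dim_value(4);

  // DequantizeLinear(x_quant, x_scale, x_zero_point) -> x_float
  auto* node = graph.add_node();
  node->set_op_type("DequantizeLinear");
  node->add_input("x_quant");
  node->add_input("x_scale");
  node->add_input("x_zero_point");
  node->add_output("x_float");
  return graph;
}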