From 02af2d9b808aa66510cf407b87673e5348731c56 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Fri, 6 Sep 2024 18:25:46 -0700 Subject: [PATCH 01/10] Prevent int32 quantized bias from clipping by adjusting the weight's scale --- .../tools/quantization/base_quantizer.py | 4 +- .../tools/quantization/qdq_quantizer.py | 78 ++++++++++++++++++- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index b20af5137d206..e8ce9c7224e1b 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -230,8 +230,8 @@ def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1 # TODO: This formula should be explained including why the scale is not estimated for the bias as well. bias_scale = input_scale * weight_scale * beta - quantized_data = (np.asarray(bias_data) / bias_scale).round() - quantized_data = np.clip(quantized_data, np.iinfo(np.int32).min, np.iinfo(np.int32).max) + quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64) + quantized_data = np.clip(quantized_data.round(), np.iinfo(np.int32).min, np.iinfo(np.int32).max) quantized_data = quantized_data.astype(np.int32) # update bias initializer diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index b71f332252850..2069889a467c4 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1029,6 +1029,68 @@ def quantize_weight_per_channel( return q_weight_name, zp_name, scale_name + def adjust_other_input_scales_for_int32_bias( + self, + input_scale_tp: onnx.TensorProto, + weight_scale_tp: onnx.TensorProto, + bias_tp: onnx.TensorProto, + beta: float, + ) -> tuple[np.ndarray, np.ndarray]: + """ + Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. + A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to + be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be + increased to prevent this from happening. + """ + input_scale: np.ndarray = tensor_proto_to_array(input_scale_tp) + weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) + bias_float_data: np.ndarray = tensor_proto_to_array(bias_tp) + assert bias_float_data.size > 0, "Expect bias input to have some data" + + # Check the shape of the weight's scale to determine if using per-channel or per-tensor quantization. + weight_scale_rank: int = len(weight_scale.shape) + is_per_tensor: bool = weight_scale_rank == 0 or (weight_scale_rank == 1 and weight_scale.shape[0] == 1) + + qmin = np.asarray(np.iinfo(np.int32).min) + qmax = np.asarray(np.iinfo(np.int32).max) + + if is_per_tensor: + _, bias_true_scale = compute_scale_zp( + rmin=bias_float_data.min(), rmax=bias_float_data.max(), qmin=qmin, qmax=qmax, symmetric=True + ) + bias_true_scale = np.asarray(bias_true_scale, dtype=np.float64) + bias_candidate_scale = np.asarray(input_scale * weight_scale * beta, dtype=np.float64) + ratio = bias_true_scale / bias_candidate_scale + + if ratio > 1.0: + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. 
+ weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) + weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) + else: + assert weight_scale_rank == 1, "per-channel scales should be a 1D tensor" + assert len(bias_float_data.shape) == 1, "bias should be a 1D tensor for per-channel quant" + + num_elems = weight_scale.shape[0] + assert num_elems == bias_float_data.shape[0], "Bias shape should match per-channel weight scale's shape" + + updated_an_elem = False + + for i in range(num_elems): + bias_rmax = np.abs(bias_float_data[i]) + bias_true_scale = (2.0 * bias_rmax) / (np.float64(qmax) - np.float64(qmin)) + bias_candidate_scale = np.asarray(input_scale * weight_scale[i] * beta, dtype=np.float64) + ratio = bias_true_scale / bias_candidate_scale + + if ratio > 1.0: + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + weight_scale[i] *= np.asarray(ratio, dtype=weight_scale.dtype) + updated_an_elem = True + + if updated_an_elem: + weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) + + return (input_scale, weight_scale) + def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str: """ Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale @@ -1038,17 +1100,25 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s if bias_name in self.quantized_value_map: return self.quantized_value_map[bias_name].original.q_name + bias_initializer = find_by_name(bias_name, self.model.initializer()) + assert bias_initializer is not None, "Bias should be a weight" + # get scale for weight weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name - weight_initializer = find_by_name(weight_scale_name, self.model.initializer()) - weight_scale = tensor_proto_to_array(weight_initializer) + weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer()) # get scale for input input_scale_name = ( self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name ) - inputscale_initializer = find_by_name(input_scale_name, self.model.initializer()) - input_scale = tensor_proto_to_array(inputscale_initializer) + input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) + + input_scale, weight_scale = self.adjust_other_input_scales_for_int32_bias( + input_scale_initializer, + weight_scale_initializer, + bias_initializer, + bias_info.beta, + ) ( quantized_bias_name, From 4032320f408724efc24b4d1f4047e461d4490131 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Sep 2024 00:23:03 -0700 Subject: [PATCH 02/10] Dont adjust for bias quantized to float8. 
Ensure tiny scales are not set to 1.0 when trying to compare magnitudes --- .../tools/quantization/qdq_quantizer.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 2069889a467c4..83717e0fe9520 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1029,7 +1029,7 @@ def quantize_weight_per_channel( return q_weight_name, zp_name, scale_name - def adjust_other_input_scales_for_int32_bias( + def adjust_weight_scale_for_int32_bias( self, input_scale_tp: onnx.TensorProto, weight_scale_tp: onnx.TensorProto, @@ -1055,8 +1055,14 @@ def adjust_other_input_scales_for_int32_bias( qmax = np.asarray(np.iinfo(np.int32).max) if is_per_tensor: + tiny_float = np.finfo(bias_float_data.dtype).tiny _, bias_true_scale = compute_scale_zp( - rmin=bias_float_data.min(), rmax=bias_float_data.max(), qmin=qmin, qmax=qmax, symmetric=True + rmin=bias_float_data.min(), + rmax=bias_float_data.max(), + qmin=qmin, + qmax=qmax, + symmetric=True, + min_real_range=tiny_float, # Prevent compute_scale_zp() from using a scale of 1.0 for tiny scales ) bias_true_scale = np.asarray(bias_true_scale, dtype=np.float64) bias_candidate_scale = np.asarray(input_scale * weight_scale * beta, dtype=np.float64) @@ -1106,19 +1112,22 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s # get scale for weight weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer()) + weight_scale = tensor_proto_to_array(weight_scale_initializer) # get scale for input input_scale_name = ( self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name ) input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) - - input_scale, weight_scale = self.adjust_other_input_scales_for_int32_bias( - input_scale_initializer, - weight_scale_initializer, - bias_initializer, - bias_info.beta, - ) + input_scale = tensor_proto_to_array(input_scale_initializer) + + if self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN: + input_scale, weight_scale = self.adjust_weight_scale_for_int32_bias( + input_scale_initializer, + weight_scale_initializer, + bias_initializer, + bias_info.beta, + ) ( quantized_bias_name, From 369d481e4fa74f9ce22293b8ed62f3e01c4f1b58 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Sep 2024 00:56:56 -0700 Subject: [PATCH 03/10] Dont call unnecessary function --- .../tools/quantization/qdq_quantizer.py | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 83717e0fe9520..7028211e2ce3f 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1051,20 +1051,14 @@ def adjust_weight_scale_for_int32_bias( weight_scale_rank: int = len(weight_scale.shape) is_per_tensor: bool = weight_scale_rank == 0 or (weight_scale_rank == 1 and weight_scale.shape[0] == 1) - qmin = np.asarray(np.iinfo(np.int32).min) - qmax = np.asarray(np.iinfo(np.int32).max) + int32_info = np.iinfo(np.int32) + qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min, dtype=np.float64) if is_per_tensor: - tiny_float = 
np.finfo(bias_float_data.dtype).tiny - _, bias_true_scale = compute_scale_zp( - rmin=bias_float_data.min(), - rmax=bias_float_data.max(), - qmin=qmin, - qmax=qmax, - symmetric=True, - min_real_range=tiny_float, # Prevent compute_scale_zp() from using a scale of 1.0 for tiny scales - ) - bias_true_scale = np.asarray(bias_true_scale, dtype=np.float64) + rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) + rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) + absmax = np.maximum(np.abs(rmin), np.abs(rmax)) + bias_true_scale = (2.0 * absmax) / qrange bias_candidate_scale = np.asarray(input_scale * weight_scale * beta, dtype=np.float64) ratio = bias_true_scale / bias_candidate_scale @@ -1072,18 +1066,14 @@ def adjust_weight_scale_for_int32_bias( # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) - else: - assert weight_scale_rank == 1, "per-channel scales should be a 1D tensor" - assert len(bias_float_data.shape) == 1, "bias should be a 1D tensor for per-channel quant" - + elif weight_scale_rank == 1 and weight_scale.shape == bias_float_data.shape: + # per-channel case num_elems = weight_scale.shape[0] - assert num_elems == bias_float_data.shape[0], "Bias shape should match per-channel weight scale's shape" - updated_an_elem = False for i in range(num_elems): bias_rmax = np.abs(bias_float_data[i]) - bias_true_scale = (2.0 * bias_rmax) / (np.float64(qmax) - np.float64(qmin)) + bias_true_scale = (2.0 * bias_rmax) / qrange bias_candidate_scale = np.asarray(input_scale * weight_scale[i] * beta, dtype=np.float64) ratio = bias_true_scale / bias_candidate_scale From d4ff4386b6f869457b1bbaf871ed2ee9124ae635 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Sep 2024 01:13:00 -0700 Subject: [PATCH 04/10] Only adjust for a beta of 1.0 --- .../tools/quantization/qdq_quantizer.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 7028211e2ce3f..555b083364e60 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1034,7 +1034,6 @@ def adjust_weight_scale_for_int32_bias( input_scale_tp: onnx.TensorProto, weight_scale_tp: onnx.TensorProto, bias_tp: onnx.TensorProto, - beta: float, ) -> tuple[np.ndarray, np.ndarray]: """ Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. @@ -1058,12 +1057,12 @@ def adjust_weight_scale_for_int32_bias( rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) absmax = np.maximum(np.abs(rmin), np.abs(rmax)) - bias_true_scale = (2.0 * absmax) / qrange - bias_candidate_scale = np.asarray(input_scale * weight_scale * beta, dtype=np.float64) - ratio = bias_true_scale / bias_candidate_scale + bias_smallest_valid_scale = (2.0 * absmax) / qrange + bias_candidate_scale = np.asarray(input_scale * weight_scale, dtype=np.float64) - if ratio > 1.0: + if bias_candidate_scale < bias_smallest_valid_scale: # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. 
+ ratio = bias_smallest_valid_scale / bias_candidate_scale weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) elif weight_scale_rank == 1 and weight_scale.shape == bias_float_data.shape: @@ -1073,12 +1072,12 @@ def adjust_weight_scale_for_int32_bias( for i in range(num_elems): bias_rmax = np.abs(bias_float_data[i]) - bias_true_scale = (2.0 * bias_rmax) / qrange - bias_candidate_scale = np.asarray(input_scale * weight_scale[i] * beta, dtype=np.float64) - ratio = bias_true_scale / bias_candidate_scale + bias_smallest_valid_scale = (2.0 * bias_rmax) / qrange + bias_candidate_scale = np.asarray(input_scale * weight_scale[i], dtype=np.float64) - if ratio > 1.0: + if bias_candidate_scale < bias_smallest_valid_scale: # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + ratio = bias_smallest_valid_scale / bias_candidate_scale weight_scale[i] *= np.asarray(ratio, dtype=weight_scale.dtype) updated_an_elem = True @@ -1111,12 +1110,11 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) input_scale = tensor_proto_to_array(input_scale_initializer) - if self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN: + if self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN and bias_info.beta == 1.0: input_scale, weight_scale = self.adjust_weight_scale_for_int32_bias( input_scale_initializer, weight_scale_initializer, bias_initializer, - bias_info.beta, ) ( From 6e1526b323a3a47730bcfbd704aef1decd0ded12 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Sep 2024 03:09:41 -0700 Subject: [PATCH 05/10] Check for possible zero division --- .../python/tools/quantization/qdq_quantizer.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 555b083364e60..61c17105dfce2 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1033,7 +1033,7 @@ def adjust_weight_scale_for_int32_bias( self, input_scale_tp: onnx.TensorProto, weight_scale_tp: onnx.TensorProto, - bias_tp: onnx.TensorProto, + bias_float_data: np.ndarray, ) -> tuple[np.ndarray, np.ndarray]: """ Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. @@ -1043,8 +1043,6 @@ def adjust_weight_scale_for_int32_bias( """ input_scale: np.ndarray = tensor_proto_to_array(input_scale_tp) weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) - bias_float_data: np.ndarray = tensor_proto_to_array(bias_tp) - assert bias_float_data.size > 0, "Expect bias input to have some data" # Check the shape of the weight's scale to determine if using per-channel or per-tensor quantization. weight_scale_rank: int = len(weight_scale.shape) @@ -1060,7 +1058,7 @@ def adjust_weight_scale_for_int32_bias( bias_smallest_valid_scale = (2.0 * absmax) / qrange bias_candidate_scale = np.asarray(input_scale * weight_scale, dtype=np.float64) - if bias_candidate_scale < bias_smallest_valid_scale: + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. 
ratio = bias_smallest_valid_scale / bias_candidate_scale weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) @@ -1075,7 +1073,7 @@ def adjust_weight_scale_for_int32_bias( bias_smallest_valid_scale = (2.0 * bias_rmax) / qrange bias_candidate_scale = np.asarray(input_scale * weight_scale[i], dtype=np.float64) - if bias_candidate_scale < bias_smallest_valid_scale: + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. ratio = bias_smallest_valid_scale / bias_candidate_scale weight_scale[i] *= np.asarray(ratio, dtype=weight_scale.dtype) @@ -1095,9 +1093,6 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s if bias_name in self.quantized_value_map: return self.quantized_value_map[bias_name].original.q_name - bias_initializer = find_by_name(bias_name, self.model.initializer()) - assert bias_initializer is not None, "Bias should be a weight" - # get scale for weight weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer()) @@ -1111,10 +1106,13 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s input_scale = tensor_proto_to_array(input_scale_initializer) if self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN and bias_info.beta == 1.0: + bias_initializer = find_by_name(bias_name, self.model.initializer()) + bias_float_data = tensor_proto_to_array(bias_initializer) + input_scale, weight_scale = self.adjust_weight_scale_for_int32_bias( input_scale_initializer, weight_scale_initializer, - bias_initializer, + bias_float_data, ) ( From a7f504606eb183b7c9ffcbc4c0528f057dfc57a6 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Mon, 9 Sep 2024 03:27:19 -0700 Subject: [PATCH 06/10] Only return weight_scale --- .../python/tools/quantization/qdq_quantizer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 61c17105dfce2..ca60ec5092bf8 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1029,19 +1029,18 @@ def quantize_weight_per_channel( return q_weight_name, zp_name, scale_name - def adjust_weight_scale_for_int32_bias( + def _adjust_weight_scale_for_int32_bias( self, - input_scale_tp: onnx.TensorProto, + input_scale: np.ndarray, weight_scale_tp: onnx.TensorProto, bias_float_data: np.ndarray, - ) -> tuple[np.ndarray, np.ndarray]: + ) -> np.ndarray: """ Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be increased to prevent this from happening. """ - input_scale: np.ndarray = tensor_proto_to_array(input_scale_tp) weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) # Check the shape of the weight's scale to determine if using per-channel or per-tensor quantization. 
@@ -1082,7 +1081,7 @@ def adjust_weight_scale_for_int32_bias( if updated_an_elem: weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) - return (input_scale, weight_scale) + return weight_scale def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str: """ @@ -1109,8 +1108,8 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s bias_initializer = find_by_name(bias_name, self.model.initializer()) bias_float_data = tensor_proto_to_array(bias_initializer) - input_scale, weight_scale = self.adjust_weight_scale_for_int32_bias( - input_scale_initializer, + weight_scale = self._adjust_weight_scale_for_int32_bias( + input_scale, weight_scale_initializer, bias_float_data, ) From 1c1e6df05ad64324e94f9d7e21c48fb1d70971f5 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 15 Oct 2024 01:52:15 -0700 Subject: [PATCH 07/10] Add unit test for adjusting weight's scale for int32 bias --- .../tools/quantization/base_quantizer.py | 14 +- .../python/tools/quantization/onnx_model.py | 20 ++ .../tools/quantization/qdq_quantizer.py | 58 ++++- .../python/tools/quantization/quantize.py | 6 + .../test/python/quantization/test_qdq.py | 199 ++++++++++++++++++ 5 files changed, 285 insertions(+), 12 deletions(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index e8ce9c7224e1b..caf22abfb98c4 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -230,9 +230,19 @@ def quantize_bias_static_impl(self, bias_name, input_scale, weight_scale, beta=1 # TODO: This formula should be explained including why the scale is not estimated for the bias as well. bias_scale = input_scale * weight_scale * beta + # Quantize by dividing by bias_scale quantized_data = np.asarray(bias_data, dtype=np.float64) / np.asarray(bias_scale, dtype=np.float64) - quantized_data = np.clip(quantized_data.round(), np.iinfo(np.int32).min, np.iinfo(np.int32).max) - quantized_data = quantized_data.astype(np.int32) + quantized_data = quantized_data.round() + + # Clip quantized data to the range of a int32 + int32_min = np.float64(np.iinfo(np.int32).min) + int32_max = np.float64(np.iinfo(np.int32).max) + if np.any(quantized_data < int32_min) or np.any(quantized_data > int32_max): + logging.warning( + f"Quantized bias `{bias_name}` exceeds the range of a int32. The bias scale is too small." + ) + + quantized_data = np.clip(quantized_data, int32_min, int32_max).astype(np.int32) # update bias initializer bias_np_data = np.asarray(quantized_data, dtype=np.int32).reshape(bias_initializer.dims) diff --git a/onnxruntime/python/tools/quantization/onnx_model.py b/onnxruntime/python/tools/quantization/onnx_model.py index 174bf5fd1509c..43105550139de 100644 --- a/onnxruntime/python/tools/quantization/onnx_model.py +++ b/onnxruntime/python/tools/quantization/onnx_model.py @@ -296,6 +296,26 @@ def get_largest_node_name_suffix(self, node_name_prefix): return suffix + def get_largest_initializer_name_suffix(self, initializer_name_prefix): + """ + Gets the largest initializer name integer suffix for all initializer names that begin + with `initializer_name_prefix`. This can be used to create unique initializer names. + + Example: for initializer names 'my_weight_0' and 'my_weight_3', this method returns 3 if + `initializer_name_prefix` is 'my_weight_'. 
+ """ + suffix = -1 + + for initializer in self.model.graph.initializer: + if initializer.name.startswith(initializer_name_prefix): + try: + index = int(initializer.name[len(initializer_name_prefix) :]) + suffix = max(index, suffix) + except ValueError: + continue + + return suffix + def find_nodes_by_initializer(self, graph, initializer): """ Find all nodes with given initializer as an input. diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index ca60ec5092bf8..963f5b232f1a2 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -191,6 +191,9 @@ def __init__( # Used in the QDQRemovableActivation class. self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False) + # Let user disable adjustment of weight scales for bias inputs that are quantized to int32. + self.qdq_disable_weight_adjust_for_int32_bias = extra_options.get("QDQDisableWeightAdjustForInt32Bias", False) + # The ONNX spec did not support 16-bit Q/DQ ops before opset 21. # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types # are 16-bit or 4-bit integers. @@ -359,7 +362,22 @@ def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, be if bias_name not in self.bias_to_quantize: self.bias_to_quantize[bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta) else: - logging.warning(f"Bias {bias_name} has already been marked for quantization") + # This bias input is consumed by two different nodes. We need to duplicate the bias so that + # each node has its own bias input. This is necessary because the bias's scale is computed + # from the node's other input scales. + new_bias_suffix: int = self.model.get_largest_initializer_name_suffix(bias_name) + 1 + new_bias_name = f"{bias_name}{new_bias_suffix}" + new_weight = onnx.TensorProto() + new_weight.CopyFrom(weight) + new_weight.name = new_bias_name + self.model.add_initializer(new_weight) + + # Replace this node's bias input + self.model.replace_input_of_nodes(bias_name, new_bias_name, {node_name}) + + # Add this to our list of biases to quantize. + self.bias_to_quantize[new_bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta) + logging.info(f"Created a copy of bias input '{bias_name}' called '{new_bias_name}'") else: logging.warning(f"Expected {bias_name} to be a weight") @@ -1033,7 +1051,7 @@ def _adjust_weight_scale_for_int32_bias( self, input_scale: np.ndarray, weight_scale_tp: onnx.TensorProto, - bias_float_data: np.ndarray, + bias_tp: onnx.TensorProto, ) -> np.ndarray: """ Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. @@ -1041,6 +1059,7 @@ def _adjust_weight_scale_for_int32_bias( be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be increased to prevent this from happening. """ + bias_float_data = tensor_proto_to_array(bias_tp) weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) # Check the shape of the weight's scale to determine if using per-channel or per-tensor quantization. 
@@ -1048,18 +1067,25 @@ def _adjust_weight_scale_for_int32_bias( is_per_tensor: bool = weight_scale_rank == 0 or (weight_scale_rank == 1 and weight_scale.shape[0] == 1) int32_info = np.iinfo(np.int32) + multiplicative_epsilon = 1.0001 qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min, dtype=np.float64) if is_per_tensor: rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) absmax = np.maximum(np.abs(rmin), np.abs(rmax)) - bias_smallest_valid_scale = (2.0 * absmax) / qrange - bias_candidate_scale = np.asarray(input_scale * weight_scale, dtype=np.float64) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * absmax) / qrange + bias_candidate_scale = np.asarray(input_scale, dtype=np.float64) * np.asarray( + weight_scale, dtype=np.float64 + ) if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. ratio = bias_smallest_valid_scale / bias_candidate_scale + logging.info( + f"Increasing weight's scale `{weight_scale_tp.name}` by the ratio {ratio} to " + f"ensure bias input `{bias_tp.name}` has a valid scale." + ) weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) elif weight_scale_rank == 1 and weight_scale.shape == bias_float_data.shape: @@ -1069,13 +1095,22 @@ def _adjust_weight_scale_for_int32_bias( for i in range(num_elems): bias_rmax = np.abs(bias_float_data[i]) - bias_smallest_valid_scale = (2.0 * bias_rmax) / qrange - bias_candidate_scale = np.asarray(input_scale * weight_scale[i], dtype=np.float64) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * bias_rmax) / qrange + bias_candidate_scale = np.asarray(input_scale, dtype=np.float64) * np.asarray( + weight_scale[i], dtype=np.float64 + ) if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. ratio = bias_smallest_valid_scale / bias_candidate_scale - weight_scale[i] *= np.asarray(ratio, dtype=weight_scale.dtype) + logging.info( + f"Increased scale[{i}] for weight scale `{weight_scale_tp.name}` by ratio {ratio} " + f"to ensure bias input `{bias_tp.name}` has a valid scale." 
+ ) + new_value = np.asarray(weight_scale[i], dtype=weight_scale.dtype) * np.asarray( + ratio, dtype=weight_scale.dtype + ) + weight_scale[i] = np.asarray(new_value, dtype=weight_scale.dtype) updated_an_elem = True if updated_an_elem: @@ -1104,14 +1139,17 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) input_scale = tensor_proto_to_array(input_scale_initializer) - if self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN and bias_info.beta == 1.0: + if ( + self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN + and bias_info.beta == 1.0 + and not self.qdq_disable_weight_adjust_for_int32_bias + ): bias_initializer = find_by_name(bias_name, self.model.initializer()) - bias_float_data = tensor_proto_to_array(bias_initializer) weight_scale = self._adjust_weight_scale_for_int32_bias( input_scale, weight_scale_initializer, - bias_float_data, + bias_initializer, ) ( diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 745344dc01fcb..ff5d4ff34a2f0 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -192,6 +192,9 @@ def __init__( removed if activations are asymmetrically quantized. Keeping these activations is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model. + QDQDisableWeightAdjustForInt32Bias = True/False: + Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias + has a scale (input_scale * weight_scale) that is too small. execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc. Raises: ValueError: Raise ValueError if execution provider is unknown @@ -438,6 +441,9 @@ def quantize_static( removed if activations are asymmetrically quantized. Keeping these activations is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model. + QDQDisableWeightAdjustForInt32Bias = True/False: + Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias + has a scale (input_scale * weight_scale) that is too small. """ if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN: if calibrate_method != CalibrationMethod.Distribution: diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index b99c11abf6d2c..714b088b63919 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -1726,5 +1726,204 @@ def test_json_serialization(self): write_calibration_table(new_calibrate_tensors_range) +class TestAdjustWeightScaleForInt32Bias(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.adj_int32_bias_") + + # Note: swap with the commented line if you want to see the models in local test dir. + # cls._tmp_dir_path = cls._tmp_model_dir.name + cls._tmp_dir_path = "." 
+ + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_conv_test_model( + self, + input0_shape: list[int], + weight_shape: list[int], + onnx_float_type: onnx.TensorProto.DataType, + ): + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_float_type) + input_0 = onnx.helper.make_tensor_value_info("input_0", onnx_float_type, input0_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", onnx_float_type, None) + + tiny_value = 1e-7 if np_float_type == np.float32 else 0.007782 + # weight_scale = 2*tiny_value / 255.0 = 7.84313725490196e-10 + + weight_data = np.full(weight_shape, tiny_value, dtype=np_float_type) + with np.nditer(weight_data, op_flags=["readwrite"]) as it: + for i, x in enumerate(it): + if i % 2 == 0: + x[...] = -x + + weight = onnx.numpy_helper.from_array(weight_data, "weight") + + # if we set input_scale to 0.05, then normally bias_scale would be + # (input_scale * weight_scale) => (0.05 * 7.84314e-10) => 3.9215686274509805e-11 + # + # If we quantize the f32 bias with this bias_scale, we get + # [5.0/bias_scale, 4.0/bias_scale] = [127500000000, 102000000000]. These quantized bias values exceed the + # range of int32. + # + # The ORT quantization tool will clamp these out-of-bounds values to int32::max(), + # which can be very inaccurate. + bias_shape = [weight_shape[0]] + bias_data = np.ones(bias_shape, dtype=np_float_type) + with np.nditer(bias_data, op_flags=["readwrite"]) as it: + for i, x in enumerate(it): + if i % 2 == 0: + x[...] = 5.0 if np_float_type == np.float32 else 1400 + else: + x[...] = -4.5 if np_float_type == np.float32 else -1200 + + bias = onnx.numpy_helper.from_array(bias_data, "bias") + + conv_node = onnx.helper.make_node("Conv", ["input_0", "weight", "bias"], ["output_0"], name="Conv0") + graph = onnx.helper.make_graph( + [conv_node], + "Convfloat", + [input_0], + [output_0], + initializer=[weight, bias], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_adjust_weight_scale_for_int32_bias(self): + """ + Test adjustment of weight input's scale to ensure int32 bias's scale is not too small. + """ + test_configs = [ + (onnx.TensorProto.FLOAT, True), + (onnx.TensorProto.FLOAT, False), + (onnx.TensorProto.FLOAT16, True), + (onnx.TensorProto.FLOAT16, False), + ] + + for float_type, per_channel in test_configs: + with self.subTest(float_type=float_type, per_channel=per_channel): + label = f"_f{float_type}_perchannel{per_channel}" + float_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"conv{label}.qdq.onnx") + + # Create float model with a Conv that has tiny weight values. + # This tiny weight scale would normally create a very small bias scale that will saturate + # bias's int32 range. But, the qdq_quantizer adjusts the weight's scale to ensure this doesn't happen. 
+ input0_shape = [1, 2, 4, 4] + weight_shape = [2, 2, 2, 2] + float_model = self.build_conv_test_model(input0_shape, weight_shape, float_type) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(float_type) + input0_rmin = 0.0 + input0_scale = 0.05 if float_type == onnx.TensorProto.FLOAT else 0.01 + input0_rmax = (input0_scale * 255.0) + input0_rmin + input_data_list = [ + {"input_0": np.full(input0_shape, input0_rmin, dtype=np_float_type)}, + {"input_0": np.full(input0_shape, (input0_rmax - input0_rmin) / 2.0, dtype=np_float_type)}, + {"input_0": np.full(input0_shape, input0_rmax, dtype=np_float_type)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + per_channel=per_channel, + ) + + # Check correctness + data_reader.rewind() + check_model_correctness(self, float_model_path, qdq_model_path, data_reader.get_next()) + + def build_model_convs_share_bias( + self, + input0_shape: list[int], + weight_shape: list[int], + onnx_float_type: onnx.TensorProto.DataType, + ): + np_float_type = onnx.helper.tensor_dtype_to_np_dtype(onnx_float_type) + input_0 = onnx.helper.make_tensor_value_info("input_0", onnx_float_type, input0_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", onnx_float_type, None) + output_1 = onnx.helper.make_tensor_value_info("output_1", onnx_float_type, None) + + weight_0_data = np.ones(weight_shape, dtype=np_float_type) + weight_0 = onnx.numpy_helper.from_array(weight_0_data, "weight_0") + + weight_1_data = np.full(weight_shape, 0.5, dtype=np_float_type) + weight_1 = onnx.numpy_helper.from_array(weight_1_data, "weight_1") + + bias_shape = [weight_shape[0]] + bias_data = np.ones(bias_shape, dtype=np_float_type) + bias_shared = onnx.numpy_helper.from_array(bias_data, "bias_shared") + + conv_0_node = onnx.helper.make_node("Conv", ["input_0", "weight_0", "bias_shared"], ["output_0"], name="Conv0") + conv_1_node = onnx.helper.make_node("Conv", ["input_0", "weight_1", "bias_shared"], ["output_1"], name="Conv1") + graph = onnx.helper.make_graph( + [conv_0_node, conv_1_node], + "ConvWithSharedBiasToDup", + [input_0], + [output_0, output_1], + initializer=[weight_0, weight_1, bias_shared], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + model = onnx.shape_inference.infer_shapes(model) + onnx.checker.check_model(model, True) + return model + + def test_dup_shared_bias(self): + """ + Test duplicating a bias that is shared by two nodes that want to quantize their bias to int32. + """ + float_model_path = os.path.join(self._tmp_dir_path, "convs_share_bias.float.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, "convs_share_bias.qdq.onnx") + + # Create float model with a Convs that share a bias input. The QDQ quantizer should add a + # duplicate bias so that each node has its own. 
+ input0_shape = [1, 2, 4, 4] + weight_shape = [2, 2, 2, 2] + float_model = self.build_model_convs_share_bias(input0_shape, weight_shape, onnx.TensorProto.FLOAT) + onnx.save_model(float_model, float_model_path) + + # Create a data reader + input0_rmin = 0.0 + input0_scale = 0.05 + input0_rmax = (input0_scale * 255.0) + input0_rmin + input_data_list = [ + {"input_0": np.full(input0_shape, input0_rmin, dtype=np.float32)}, + {"input_0": np.full(input0_shape, (input0_rmax - input0_rmin) / 2.0, dtype=np.float32)}, + {"input_0": np.full(input0_shape, input0_rmax, dtype=np.float32)}, + ] + data_reader = TestDataFeeds(input_data_list) + + # quantize model to QDQ + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + ) + + qdq_model = onnx.load_model(qdq_model_path) + bias_names = set() + + for node in qdq_model.graph.node: + if node.op_type == "DequantizeLinear" and node.input[0].startswith("bias_shared"): + bias_names.add(node.input[0]) + + self.assertEqual(len(bias_names), 2) + + if __name__ == "__main__": unittest.main() From 23cb7b62f2a6bd502c68906c5770950e255bcdee Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 15 Oct 2024 01:56:16 -0700 Subject: [PATCH 08/10] Save unittest model to tmp dir --- onnxruntime/test/python/quantization/test_qdq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index 714b088b63919..24039fe7398a8 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -1732,8 +1732,8 @@ def setUpClass(cls): cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.adj_int32_bias_") # Note: swap with the commented line if you want to see the models in local test dir. - # cls._tmp_dir_path = cls._tmp_model_dir.name - cls._tmp_dir_path = "." + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." @classmethod def tearDownClass(cls): From 2d5d7ac7199b350b0da3f79b2f92cd13d0cbc9a8 Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Tue, 15 Oct 2024 09:26:00 -0700 Subject: [PATCH 09/10] Cite reference --- onnxruntime/python/tools/quantization/qdq_quantizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 963f5b232f1a2..dab3c5afde341 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -1058,6 +1058,10 @@ def _adjust_weight_scale_for_int32_bias( A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be increased to prevent this from happening. 
+ + Although the adjustment method and amount differs, the idea to adjust the weight's scale came from the following + reference: + https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/optimize/quantization_utils.cc#L252 """ bias_float_data = tensor_proto_to_array(bias_tp) weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) From 78607c73aed91992d44279d5024a5f428e99058f Mon Sep 17 00:00:00 2001 From: adrianlizarraga Date: Thu, 31 Oct 2024 06:04:36 -0700 Subject: [PATCH 10/10] Update with latest --- .../tools/quantization/base_quantizer.py | 45 +- .../tools/quantization/qdq_quantizer.py | 600 +++++++++++------- .../python/tools/quantization/quant_utils.py | 196 +++++- .../quantization/tensor_quant_overrides.py | 4 + .../python/quantization/test_quant_util.py | 2 +- .../test_tensor_quant_overrides_option.py | 12 +- 6 files changed, 577 insertions(+), 282 deletions(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index caf22abfb98c4..b12465ffa7926 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -21,7 +21,6 @@ from .quant_utils import ( ONNX_TYPE_TO_NP_TYPE, TENSOR_NAME_QUANT_SUFFIX, - QuantType, find_by_name, model_has_infer_metadata, normalize_axis, @@ -40,18 +39,26 @@ def __init__(self, **data: Dict[str, Any]): for k, v in data.items(): if not isinstance(k, str): raise TypeError(f"Keys must be strings not {type(k)} for k={k!r}.") - if not isinstance(v, (int, str, np.ndarray)): + if k != "axis" and not isinstance(v, (int, str, np.ndarray)): raise TypeError(f"Values must be numpy arrays, int, float, str not {type(v)} for k={k!r}.") + if k == "axis" and not isinstance(v, int) and v is not None: + raise TypeError(f"Axis value must be an int or None, not {type(v)}.") if k == "scale" and v.dtype not in (np.float32, np.float16): raise ValueError(f"scale must a float32 or float16 numpy element but is {v.dtype} for k={k!r}") self.data[k] = v + def get(self, key, default_value=None): + return self.data.get(key, default_value) + def __iter__(self): yield from self.data def __getitem__(self, key): return self.data[key] + def __setitem__(self, key, value): + self.data[key] = value + def __len__(self): return len(self.data) @@ -88,9 +95,10 @@ def __init__( self.force_quantize_no_input_check = ( "ForceQuantizeNoInputCheck" in self.extra_options and self.extra_options["ForceQuantizeNoInputCheck"] ) - self.is_weight_symmetric = self.extra_options.get( - "WeightSymmetric", weight_qType in (QuantType.QInt8, QuantType.QInt16, QuantType.QFLOAT8E4M3FN) - ) + + # If user does not explicitly set "WeightSymmetric", then the weight's quantization type determines + # the symmetry (i.e., signed integer types will use symmetric quantization). See `def is_weight_symmetric()` + self._is_weight_symmetric: bool | None = self.extra_options.get("WeightSymmetric", None) self.is_activation_symmetric = self.extra_options.get("ActivationSymmetric", False) self.min_real_range = self.extra_options.get("MinimumRealRange") @@ -131,6 +139,16 @@ def __init__( self.tensor_quant_override_qtypes = self.tensor_quant_overrides.get_quant_types() + def is_weight_symmetric(self, weight_quant_type: onnx.TensorProto.DataType) -> bool: + if self._is_weight_symmetric is not None: + return self._is_weight_symmetric # Return value explicitly set by user. 
+ return weight_quant_type in ( + onnx.TensorProto.INT4, + onnx.TensorProto.INT8, + onnx.TensorProto.INT16, + onnx.TensorProto.FLOAT8E4M3FN, + ) + def quantize_model(self): raise NotImplementedError @@ -292,6 +310,7 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa If keep_float_weight is False, quantize the weight, or don't quantize the weight. :return: quantized weight name, zero point name, scale name """ + # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there. q_weight_name = weight.name + TENSOR_NAME_QUANT_SUFFIX zp_name = weight.name + "_zero_point" scale_name = weight.name + "_scale" @@ -313,10 +332,11 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa assert isinstance(scale, np.ndarray), f"Unexpected type {type(scale)}" else: - _, _, zero_point, scale, q_weight_data = quantize_data( + symmetric = self.is_weight_symmetric(qType) if qType == self.weight_qType else self.is_activation_symmetric + zero_point, scale, q_weight_data = quantize_data( weight_data.flatten(), qType, - quant_overrides.get("symmetric", self.is_weight_symmetric), + quant_overrides.get("symmetric", symmetric), reduce_range=quant_overrides.get("reduce_range", self.reduce_range and reduce_range), min_real_range=self.min_real_range, rmin_override=quant_overrides.get("rmin"), @@ -381,6 +401,7 @@ def quantize_weight_per_channel_impl( reduce_range=True, keep_float_weight=False, ): + # TODO(adrianlizarraga): This function is now only used by onnx_quantizer.py, so move it there. initializer = find_by_name(weight_name, self.model.initializer()) if initializer is None: raise ValueError("{} is not an initializer", weight_name) @@ -419,13 +440,7 @@ def quantize_weight_per_channel_impl( if "quant_type" in quant_overrides_for_channels[0]: weight_qType = quant_overrides_for_channels[0]["quant_type"].tensor_type # noqa: N806 - symmetric = quant_overrides_for_channels[0].get( - "symmetric", - ( - self.is_weight_symmetric - or weight_qType in (onnx.TensorProto.INT8, onnx.TensorProto.FLOAT8E4M3FN, onnx.TensorProto.INT4) - ), - ) + symmetric = quant_overrides_for_channels[0].get("symmetric", self.is_weight_symmetric(weight_qType)) reduce_range = quant_overrides_for_channels[0].get("reduce_range", self.reduce_range and reduce_range) zero_point_list = [] scale_list = [] @@ -454,7 +469,7 @@ def quantize_weight_per_channel_impl( ), f"Unexpected type {type(quantized_per_channel_data)}" else: - _, _, zero_point, scale, quantized_per_channel_data = quantize_data( + zero_point, scale, quantized_per_channel_data = quantize_data( per_channel_data.flatten(), weight_qType, symmetric, diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index dab3c5afde341..048c7f3296503 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -20,6 +20,7 @@ from .calibrate import TensorData from .quant_utils import ( DEQUANT_OP_NAME, + ONNX_TYPE_TO_NP_TYPE, QUANT_OP_NAME, QuantizedValue, QuantizedValueType, @@ -30,12 +31,14 @@ add_quant_input_suffix, add_quant_output_suffix, add_quant_suffix, + compute_data_quant_params, compute_scale_zp, compute_scale_zp_float8, find_by_name, get_qmin_qmax_for_qType, ms_domain, normalize_axis, + quantize_onnx_initializer, tensor_proto_to_array, ) from .registry import CreateQDQQuantizer @@ -86,6 +89,18 @@ class QDQTensorQuantParams: converted: QuantizationParams | None # 
Converted type consumed by some (or all/none) consumer nodes. converted_recv_nodes: set[str] | None # The name of nodes that consume the converted type. + def get_for_consumer(self, consumer_node_name) -> QuantizationParams: + if self.converted is None: # Quantized value is not converted, return original + return self.original + + if self.converted_recv_nodes is None: # All consumers receive the converted value + return self.converted + + # Check if consumer node name is in the list of nodes that + # receive the converted quantization value. If not, return the original value generated + # by the tensor's producer. + return self.converted if (consumer_node_name in self.converted_recv_nodes) else self.original + # Holds scale and zero_point initializer TensorProtos. @dataclass @@ -153,8 +168,8 @@ def __init__( op_types_to_quantize, extra_options, ) - self.tensors_to_quantize = {} - self.bias_to_quantize = {} + self.tensors_to_quantize: dict[str, QDQTensorQuantInfo] = {} + self.bias_to_quantize: dict[str, QDQBiasQuantInfo] = {} self.nodes_to_remove = [] @@ -216,6 +231,7 @@ def __init__( self.qdq_op_domain = ms_domain self.quantization_params = self.calc_graph_quant_params() + self.initializer_quant_params: dict[str, QuantizationParams] = {} # Map of all original value names to quantized value names self.quantized_value_map = {} @@ -331,6 +347,18 @@ def quantize_weight_tensor_per_channel(self, tensor_name, axis): else: logging.warning(f"only support per-channel quantization on weight. Tensor: {tensor_name} is not quantized.") + def _dup_initializer(self, initializer: onnx.TensorProto) -> onnx.TensorProto: + """ + Duplicates an existing initializer and adds it to the model. Returns the new initializer. + """ + name_suffix: int = self.model.get_largest_initializer_name_suffix(initializer.name) + 1 + new_initializer_name = f"{initializer.name}{name_suffix}" + new_initializer = onnx.TensorProto() + new_initializer.CopyFrom(initializer) + new_initializer.name = new_initializer_name + self.model.add_initializer(new_initializer) + return new_initializer + def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, beta=1.0): """ Adds a bias tensor to the list of bias tensors to quantize. Called by op quantizers that @@ -356,30 +384,160 @@ def quantize_bias_tensor(self, node_name, bias_name, input_name, weight_name, be self.quantize_weight_tensor(bias_name) return - weight = find_by_name(bias_name, self.model.initializer()) - if weight is not None: - if weight.data_type in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16): - if bias_name not in self.bias_to_quantize: - self.bias_to_quantize[bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta) - else: - # This bias input is consumed by two different nodes. We need to duplicate the bias so that - # each node has its own bias input. This is necessary because the bias's scale is computed - # from the node's other input scales. - new_bias_suffix: int = self.model.get_largest_initializer_name_suffix(bias_name) + 1 - new_bias_name = f"{bias_name}{new_bias_suffix}" - new_weight = onnx.TensorProto() - new_weight.CopyFrom(weight) - new_weight.name = new_bias_name - self.model.add_initializer(new_weight) - - # Replace this node's bias input - self.model.replace_input_of_nodes(bias_name, new_bias_name, {node_name}) - - # Add this to our list of biases to quantize. 
-                    self.bias_to_quantize[new_bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta)
-                    logging.info(f"Created a copy of bias input '{bias_name}' called '{new_bias_name}'")
-        else:
-            logging.warning(f"Expected {bias_name} to be a weight")
+        bias_initializer = find_by_name(bias_name, self.model.initializer())
+        if bias_initializer is None:
+            logging.warning(f"Expected bias '{bias_name}' to be an initializer")
+            return
+
+        if bias_initializer.data_type not in (onnx_proto.TensorProto.FLOAT, onnx_proto.TensorProto.FLOAT16):
+            logging.info(f"Expected bias '{bias_name}' to be a floating-point initializer")
+            return
+
+        actual_bias_name = bias_name
+        if bias_name in self.bias_to_quantize:
+            # This bias input is consumed by two different nodes. We need to duplicate the bias so that
+            # each node has its own bias input. This is necessary because the bias's scale is computed
+            # from the node's other input scales.
+            new_bias_initializer = self._dup_initializer(bias_initializer)
+            actual_bias_name = new_bias_initializer.name
+
+            # Replace this node's bias input
+            self.model.replace_input_of_nodes(bias_name, actual_bias_name, {node_name})
+            logging.info(f"Created a copy of bias input '{bias_name}' called '{actual_bias_name}'")
+
+        # Add this to our list of biases to quantize.
+        self.bias_to_quantize[actual_bias_name] = QDQBiasQuantInfo(node_name, input_name, weight_name, beta)
+
+    def _adjust_weight_scale_for_int32_bias(
+        self,
+        input_scale: np.ndarray,
+        weight_scale: np.ndarray,
+        weight_name: str,
+        bias_tp: onnx.TensorProto,
+        is_per_channel: bool,
+    ) -> tuple[bool, np.ndarray | None]:
+        """
+        Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small.
+        A bias scale that is too small leads to quantized bias values that fall outside the range of an int32 and have to
+        be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be
+        increased to prevent this from happening.
+
+        Although the adjustment method and amount differ, the idea to adjust the weight's scale came from the following
+        reference:
+        https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/optimize/quantization_utils.cc#L252
+
+        :param input_scale: The input's scale.
+        :param weight_scale: The weight scale to potentially adjust.
+        :param weight_name: The weight initializer's name. Used for logging.
+        :param bias_tp: The bias ONNX initializer.
+        :param is_per_channel: True if the bias and weight are quantized per-channel.
+        :return: A tuple with a bool indicating if the weight's scale was adjusted and the new weight scale.
+ """ + if not weight_scale.size: + return False, None + + bias_float_data = tensor_proto_to_array(bias_tp) + + int32_info = np.iinfo(np.int32) + multiplicative_epsilon = 1.0001 + qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min + 1, dtype=np.float64) + weight_scale_dtype = weight_scale.dtype + updated_an_elem = False + + if not is_per_channel: + rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) + rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) + absmax = np.maximum(np.abs(rmin), np.abs(rmax)) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * absmax) / qrange + + input_scale_fp64 = np.array(input_scale.item(), dtype=np.float64) + weight_scale_fp64 = np.array(weight_scale.item(), dtype=np.float64) + bias_candidate_scale = input_scale_fp64 * weight_scale_fp64 + + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + ratio = bias_smallest_valid_scale / bias_candidate_scale + logging.info( + f"Increasing scale for weight `{weight_name}` by the ratio {ratio} to " + f"ensure bias input `{bias_tp.name}` has a valid scale." + ) + new_scale = weight_scale_fp64 * ratio + weight_scale = new_scale.astype(weight_scale_dtype) + updated_an_elem = True + elif weight_scale.shape and len(weight_scale.shape) == 1: + # per-channel case + num_elems = weight_scale.shape[0] + + for i in range(num_elems): + bias_rmax = np.abs(bias_float_data[i]) + bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * bias_rmax) / qrange + + input_scale_fp64 = np.array(input_scale.item(), dtype=np.float64) + weight_scale_fp64 = np.array(weight_scale[i].item(), dtype=np.float64) + bias_candidate_scale = input_scale_fp64 * weight_scale_fp64 + if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): + # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. + ratio = bias_smallest_valid_scale / bias_candidate_scale + logging.info( + f"Increased scale[{i}] for weight `{weight_name}` by ratio {ratio} " + f"to ensure bias input `{bias_tp.name}` has a valid scale." + ) + new_scale = weight_scale_fp64 * ratio + weight_scale[i] = new_scale.astype(weight_scale_dtype) + updated_an_elem = True + + return updated_an_elem, weight_scale + + def _adjust_weight_quant_params_for_bias_tensors(self): + """ + Iterates through all bias inputs that should be quantized to int32. If the intended + bias scale (equal to input_scale * weight_scale) is too small, this function will increase + the associated weight's scale to ensure the bias does not overflow the int32 range when quantized. + """ + + if self.qdq_disable_weight_adjust_for_int32_bias: + # User passed an extra_option to disable this adjustment. + return + + for bias_name, bias_info in self.bias_to_quantize.items(): + if ( + bias_info.input_name not in self.quantization_params + or bias_info.input_name not in self.tensors_to_quantize + or bias_info.weight_name not in self.initializer_quant_params + ): + continue + + # Get the associated input's scale. 
+ input_qparams = self.quantization_params[bias_info.input_name].get_for_consumer(bias_info.node_name) + input_info = self.tensors_to_quantize[bias_info.input_name] + input_scale = np.asarray( + input_qparams["scale"], dtype=onnx.helper.tensor_dtype_to_np_dtype(input_info.data_type) + ) + + weight_quant_params = self.initializer_quant_params[bias_info.weight_name] + weight_quant_type = weight_quant_params["quant_type"] + if weight_quant_type not in (onnx.TensorProto.INT8, onnx.TensorProto.INT16): + continue + + weight_zero_point: np.ndarray = weight_quant_params["zero_point"] + if weight_zero_point.any(): + # Skip if zero_point(s) are not all zero (i.e., symmetric quant) + continue + + weight_scale: np.ndarray = weight_quant_params["scale"] + is_per_channel = weight_quant_params.get("axis", None) is not None + + # Get adjusted weight scales. + did_update_weight_scale, new_weight_scale = self._adjust_weight_scale_for_int32_bias( + input_scale, + weight_scale, + bias_info.weight_name, + find_by_name(bias_name, self.model.initializer()), + is_per_channel, + ) + + if did_update_weight_scale: + weight_quant_params["scale"] = new_weight_scale def remove_node(self, node): self.nodes_to_remove.append(node) @@ -398,6 +556,8 @@ def quantize_model(self): self.tensor_to_its_receiving_nodes[tensor_name] = [] self.tensor_to_its_receiving_nodes[tensor_name].append(node) + self.initializer_quant_params = self._calc_initializer_quant_params() + self._adjust_weight_quant_params_for_bias_tensors() self._quantize_normal_tensors() self._quantize_sharing_param_tensors() if self.quantize_bias: @@ -493,38 +653,26 @@ def _create_qdq_nodes( ) self.model.add_nodes([qlinear_node, dequant_node]) - def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): + def _add_qdq_nodes_for_initializer(self, weight_proto: onnx.TensorProto): + """ + Adds Q/DQ nodes for an initializer. If `self.add_qdq_pair_to_weight` is true, creates + the sequence (weight_f32 -> Q -> DQ -> ). Otherwise, this function quantizes the initializer + and adds the sequence (weight_quant -> DQ ->). + """ weight_name = weight_proto.name - if axis is not None: - if self.opset_version < 13: - raise ValueError("Per-Channel support with QDQ format requires onnx opset version 13 or above.") - - qtype = self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType - if qtype == onnx.onnx_pb.TensorProto.UINT8: - qtype = onnx_proto.TensorProto.INT8 - - q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel( - weight_name, - # Quantization type is forced to be TensorProto.INT8. - # when the expected value would be (see below) - # self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType. - # QLinearConv expects to have a unique value for all channels. - # This code does not enforce that but it is necessarily the case when the - # quantization is symmetric (as for INT8). 
- qtype, - axis, - keep_float_weight=self.add_qdq_pair_to_weight, - ) - else: - q_weight_name, zp_name, scale_name = self.quantize_initializer( - weight_proto, - self.weight_qType if tensor_type is QDQQuantTensorType.WEIGHT else self.activation_qType, - keep_float_weight=self.add_qdq_pair_to_weight, - ) + if weight_name in self.quantized_value_map: + return + quant_params: QuantizationParams = self.initializer_quant_params[weight_name] + axis: int = quant_params.get("axis") + scale_zp_initializers = self._make_scale_zp_initializers(weight_name, quant_params) + q_weight_name: str | None = None weight_dequant_output = add_dequant_output_suffix(weight_name) self.model.replace_input_of_all_nodes(weight_name, weight_dequant_output) + if self.add_qdq_pair_to_weight: + # Don't actually quantize the weight. Instead, keep floating-point weight and create the node + # sequence (weight_f32 -> Q -> DQ -> weight_dequant) weight_quant_output = add_quant_output_suffix(weight_name) self._create_qdq_nodes( @@ -534,14 +682,26 @@ def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): weight_quant_output, weight_dequant_output, add_dequant_suffix(weight_name), - scale_name, - zp_name, + scale_zp_initializers.scale.name, + scale_zp_initializers.zero_point.name, axis, ) else: + # Quantize the weight and create the node sequence: + # (weight_quantized -> DQ -> weight_dequant) + quant_weight = quantize_onnx_initializer( + weight_proto, + quant_params["quant_type"], + quant_params["zero_point"], + quant_params["scale"], + axis, + ) + self.model.add_initializer(quant_weight) + + q_weight_name = quant_weight.name dequant_node = onnx.helper.make_node( DEQUANT_OP_NAME, - [q_weight_name, scale_name, zp_name], + [quant_weight.name, scale_zp_initializers.scale.name, scale_zp_initializers.zero_point.name], [weight_dequant_output], add_dequant_suffix(weight_name), axis=axis, @@ -549,6 +709,17 @@ def _add_qdq_pair_for_initializer(self, weight_proto, tensor_type, axis=None): ) self.model.add_node(dequant_node) + # Log entry for this quantized weight + quantized_value = QuantizedValue( + weight_name, + q_weight_name, + scale_zp_initializers.scale.name, + scale_zp_initializers.zero_point.name, + QuantizedValueType.Initializer, + axis=axis, + ) + self.quantized_value_map[weight_name] = QDQTensorQuantizedValue(quantized_value, None, None) + def _add_qdq_pair_for_activation(self, tensor_name, scale_name, zp_name, data_type=None): if ( self.dedicated_qdq_pair @@ -785,7 +956,7 @@ def _quantize_normal_tensors(self): # Quantize the input initializer = find_by_name(tensor_name, self.model.initializer()) if initializer: - self._add_qdq_pair_for_initializer(initializer, tensor_info.tensor_type, tensor_info.axis) + self._add_qdq_nodes_for_initializer(initializer) else: tensor_qparam_initializers = self._make_tensor_scale_zp_initializers(tensor_name) if not tensor_qparam_initializers: @@ -927,45 +1098,6 @@ def _quantize_bias_tensors(self): def is_tensor_quantized(self, tensor_name: str): return tensor_name in self.tensors_to_quantize or tensor_name in self.bias_to_quantize - def quantize_initializer( - self, - weight: onnx.TensorProto, - qType: onnx.TensorProto.DataType, - reduce_range: bool = False, - keep_float_weight: bool = False, - ) -> tuple[str, str, str]: - """ - :param weight: TensorProto initializer - :param qType: type to quantize to - :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to qunatize scale and zero point. 
- If keep_float_weight is False, quantize the weight, or don't quantize the weight. - :return: quantized weight name, zero point name, scale name - """ - # Find if this input is already quantized - if weight.name in self.quantized_value_map: - quantized_value = self.quantized_value_map[weight.name].original - return ( - quantized_value.q_name, - quantized_value.zp_name, - quantized_value.scale_name, - ) - - q_weight_name, zp_name, scale_name = self.quantize_initializer_impl( - weight, qType, reduce_range, keep_float_weight - ) - - # Log entry for this quantized weight - quantized_value = QuantizedValue( - weight.name, - q_weight_name, - scale_name, - zp_name, - QuantizedValueType.Initializer, - None, - ) - self.quantized_value_map[weight.name] = QDQTensorQuantizedValue(quantized_value, None, None) - return q_weight_name, zp_name, scale_name - def is_tensor_per_channel( self, tensor_name: str, @@ -1015,113 +1147,6 @@ def is_tensor_per_channel( return True, axis - def quantize_weight_per_channel( - self, - weight_name: str, - weight_qType: onnx.TensorProto.DataType, - channel_axis: int, - reduce_range: bool = True, - keep_float_weight: bool = False, - ) -> tuple[str, str, str]: - # Find if this input is already quantized - if weight_name in self.quantized_value_map: - quantized_value = self.quantized_value_map[weight_name].original - return ( - quantized_value.q_name, - quantized_value.zp_name, - quantized_value.scale_name, - ) - - q_weight_name, zp_name, scale_name = self.quantize_weight_per_channel_impl( - weight_name, weight_qType, channel_axis, reduce_range, keep_float_weight - ) - quantized_value = QuantizedValue( - weight_name, - q_weight_name, - scale_name, - zp_name, - QuantizedValueType.Initializer, - None, - ) - self.quantized_value_map[weight_name] = QDQTensorQuantizedValue(quantized_value, None, None) - - return q_weight_name, zp_name, scale_name - - def _adjust_weight_scale_for_int32_bias( - self, - input_scale: np.ndarray, - weight_scale_tp: onnx.TensorProto, - bias_tp: onnx.TensorProto, - ) -> np.ndarray: - """ - Checks if the bias scale (input_scale * weight_scale) that we intend to use is too small. - A bias scale that is too small leads to quantized bias values that fall outside the range of a int32 and have to - be clipped, which decreases accuracy. If this function detects such a scenario, the weight_scale value will be - increased to prevent this from happening. - - Although the adjustment method and amount differs, the idea to adjust the weight's scale came from the following - reference: - https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/optimize/quantization_utils.cc#L252 - """ - bias_float_data = tensor_proto_to_array(bias_tp) - weight_scale: np.ndarray = tensor_proto_to_array(weight_scale_tp) - - # Check the shape of the weight's scale to determine if using per-channel or per-tensor quantization. 
- weight_scale_rank: int = len(weight_scale.shape) - is_per_tensor: bool = weight_scale_rank == 0 or (weight_scale_rank == 1 and weight_scale.shape[0] == 1) - - int32_info = np.iinfo(np.int32) - multiplicative_epsilon = 1.0001 - qrange = np.array(int32_info.max, dtype=np.float64) - np.array(int32_info.min, dtype=np.float64) - - if is_per_tensor: - rmin = np.minimum(bias_float_data.min(), np.array(0, dtype=np.float64)) - rmax = np.maximum(bias_float_data.max(), np.array(0, dtype=np.float64)) - absmax = np.maximum(np.abs(rmin), np.abs(rmax)) - bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * absmax) / qrange - bias_candidate_scale = np.asarray(input_scale, dtype=np.float64) * np.asarray( - weight_scale, dtype=np.float64 - ) - - if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): - # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. - ratio = bias_smallest_valid_scale / bias_candidate_scale - logging.info( - f"Increasing weight's scale `{weight_scale_tp.name}` by the ratio {ratio} to " - f"ensure bias input `{bias_tp.name}` has a valid scale." - ) - weight_scale *= np.asarray(ratio, dtype=weight_scale.dtype) - weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) - elif weight_scale_rank == 1 and weight_scale.shape == bias_float_data.shape: - # per-channel case - num_elems = weight_scale.shape[0] - updated_an_elem = False - - for i in range(num_elems): - bias_rmax = np.abs(bias_float_data[i]) - bias_smallest_valid_scale = multiplicative_epsilon * (2.0 * bias_rmax) / qrange - bias_candidate_scale = np.asarray(input_scale, dtype=np.float64) * np.asarray( - weight_scale[i], dtype=np.float64 - ) - - if (bias_candidate_scale < bias_smallest_valid_scale) and (bias_candidate_scale > 0.0): - # The candidate bias scale would be too small, so increase the weight_scale by the necessary ratio. - ratio = bias_smallest_valid_scale / bias_candidate_scale - logging.info( - f"Increased scale[{i}] for weight scale `{weight_scale_tp.name}` by ratio {ratio} " - f"to ensure bias input `{bias_tp.name}` has a valid scale." - ) - new_value = np.asarray(weight_scale[i], dtype=weight_scale.dtype) * np.asarray( - ratio, dtype=weight_scale.dtype - ) - weight_scale[i] = np.asarray(new_value, dtype=weight_scale.dtype) - updated_an_elem = True - - if updated_an_elem: - weight_scale_tp.CopyFrom(onnx.numpy_helper.from_array(weight_scale, weight_scale_tp.name)) - - return weight_scale - def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str: """ Quantized the bias. 
Zero Point == 0 and Scale == Input_Scale * Weight_Scale @@ -1143,19 +1168,6 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) input_scale = tensor_proto_to_array(input_scale_initializer) - if ( - self.weight_qType != onnx.TensorProto.FLOAT8E4M3FN - and bias_info.beta == 1.0 - and not self.qdq_disable_weight_adjust_for_int32_bias - ): - bias_initializer = find_by_name(bias_name, self.model.initializer()) - - weight_scale = self._adjust_weight_scale_for_int32_bias( - input_scale, - weight_scale_initializer, - bias_initializer, - ) - ( quantized_bias_name, quantized_bias_scale_name, @@ -1180,7 +1192,7 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s return quantized_bias_name def _make_scale_zp_initializers( - self, param_name: str, params: QuantizationParams, init_name_suffix: str = "" + self, param_name: str, quant_params: QuantizationParams, init_name_suffix: str = "" ) -> QDQScaleZpInitializers: """ Creates and returns scale and zero-point initializers for the given quantization params. The initializers are @@ -1188,31 +1200,31 @@ def _make_scale_zp_initializers( - {param_name}_zero_point{init_name_suffix} - {param_name}_scale{init_name_suffix} """ - zero_point_values = np.array([params["zero_point"]]) - if not hasattr(params["scale"], "dtype") or params["scale"].dtype not in (np.float32, np.float16): - raise ValueError(f"Unexpected type {type(params['scale'])} and param_name={param_name!r}") - scale_values = np.array([params["scale"]]) - assert scale_values.dtype != np.float64 - zero_point_type = params.data.get("quant_type", self.activation_qType) - - zero_point_shape = [] + zero_point = quant_params["zero_point"] + scale = quant_params["scale"] + zero_point_type = quant_params["quant_type"] + axis: int | None = quant_params.get("axis") + assert (axis is not None and len(scale.shape) == 1) or ( + axis is None and len(scale.shape) == 0 + ), "Wrong scale/zp shapes" + assert len(scale.shape) == len(zero_point.shape), "Scale and zero-point must have the same rank" + zero_point_name = param_name + "_zero_point" + init_name_suffix - scale_shape = [] scale_name = param_name + "_scale" + init_name_suffix # Add initializers to model init_zp = onnx.helper.make_tensor( - zero_point_name, zero_point_type, zero_point_shape, zero_point_values.ravel().tolist() + zero_point_name, zero_point_type, zero_point.shape, zero_point.ravel().tolist() ) self.model.add_initializer(init_zp) - if scale_values.dtype == np.float32: + if scale.dtype == np.float32: scale_type = onnx_proto.TensorProto.FLOAT - elif scale_values.dtype == np.float16: + elif scale.dtype == np.float16: scale_type = onnx_proto.TensorProto.FLOAT16 else: - raise ValueError(f"Unexpected dtype={scale_values.dtype} for param_name={param_name!r}") - init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale_shape, scale_values.reshape((-1,)).tolist()) + raise ValueError(f"Unexpected dtype={scale.dtype} for param_name={param_name!r}") + init_scale = onnx.helper.make_tensor(scale_name, scale_type, scale.shape, scale.ravel().tolist()) self.model.add_initializer(init_scale) return QDQScaleZpInitializers(init_scale, init_zp) @@ -1261,7 +1273,7 @@ def calc_quant_params(self, tensor_data: TensorData, quant_overrides: dict[str, qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range=reduce_range, symmetric=symmetric) zero, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, 
self.min_real_range) - return QuantizationParams(zero_point=zero, scale=scale, quant_type=quant_type) + return QuantizationParams(zero_point=zero.squeeze(), scale=scale.squeeze(), quant_type=quant_type) def calc_graph_quant_params(self) -> dict[str, QDQTensorQuantParams]: """ @@ -1291,3 +1303,127 @@ def calc_graph_quant_params(self) -> dict[str, QDQTensorQuantParams]: quantization_params[tensor_name] = QDQTensorQuantParams(original, converted, converted_recv_nodes) return quantization_params + + def _calc_initializer_quant_params(self) -> dict[str, QuantizationParams]: + """ + Returns quantization parameters (scale/zero_point/quant_type) for all initializers. + """ + + quantization_params: dict[str, QuantizationParams] = {} + for tensor_name, tensor_info in self.tensors_to_quantize.items(): + initializer = find_by_name(tensor_name, self.model.initializer()) + if not initializer: + continue + + initializer_data = tensor_proto_to_array(initializer) + initializer_rank = len(initializer_data.shape) + + # initializers for elementwise ops use the quant_type for activations. + is_weight = tensor_info.tensor_type is QDQQuantTensorType.WEIGHT + quant_type = self.weight_qType if is_weight else self.activation_qType + + # Try to get scale/zp directly from user's overrides and avoid computation. + if self.tensor_quant_overrides.overrides_scale_zp(tensor_name): + overrides = self.tensor_quant_overrides[tensor_name] + if "quant_type" in overrides[0]: + quant_type = overrides[0]["quant_type"].tensor_type + + zp_dtype = ONNX_TYPE_TO_NP_TYPE[quant_type] + is_per_channel = "axis" in overrides[0] + if not is_per_channel: + quantization_params[tensor_name] = QuantizationParams( + zero_point=np.array(overrides[0]["zero_point"], dtype=zp_dtype), + scale=np.array(overrides[0]["scale"], initializer_data.dtype), + quant_type=quant_type, + ) + else: + zero_points_list = [] + scales_list = [] + for chan_overrides in overrides: + zero_points_list.append(np.array(chan_overrides["zero_point"], zp_dtype)) + scales_list.append(np.array(chan_overrides["scale"], dtype=initializer_data.dtype)) + + channel_axis = overrides[0]["axis"] + is_axis_valid, norm_channel_axis = normalize_axis(channel_axis, initializer_rank) + if not is_axis_valid: + raise ValueError( + f"Weight {initializer.name} has a per-channel axis with value {channel_axis} that is " + f"out-of-bounds for rank {initializer_rank}" + ) + + quantization_params[tensor_name] = QuantizationParams( + zero_point=np.array(zero_points_list), + scale=np.array(scales_list), + quant_type=quant_type, + axis=norm_channel_axis, + ) + + continue + + # Compute scale/zp normally. User's overrides may still override parameters + # used to compute the scale/zp (e.g., rmin, rmax, symmetric, etc.) + overrides = self.tensor_quant_overrides.get(tensor_name, [{}]) + if "quant_type" in overrides[0]: + quant_type = overrides[0]["quant_type"].tensor_type + + channel_axis = overrides[0].get("axis", tensor_info.axis) + is_per_channel = channel_axis is not None + + # Note: always quantize per-channel initializers as symmetric because QLinear* ops require the + # same zero-point in every channel, which is necessarily the case for symmetric quantization. 
+ is_symmetric_default = is_per_channel or ( + self.is_weight_symmetric(quant_type) if is_weight else self.is_activation_symmetric + ) + is_symmetric = overrides[0].get("symmetric", is_symmetric_default) + reduce_range = overrides[0].get("reduce_range", self.reduce_range) + zero_point: np.ndarray | None = None + scale: np.ndarray | None = None + + if not is_per_channel: + zero_point, scale = compute_data_quant_params( + initializer_data.flatten(), + quant_type, + is_symmetric, + reduce_range=reduce_range, + min_real_range=self.min_real_range, + rmin_override=overrides[0].get("rmin"), + rmax_override=overrides[0].get("rmax"), + ) + else: + is_axis_valid, norm_channel_axis = normalize_axis(channel_axis, initializer_rank) + if not is_axis_valid: + raise ValueError( + f"Weight {initializer.name} has a per-channel axis with value {channel_axis} that is " + f"out-of-bounds for rank {initializer_rank}" + ) + + channel_axis = norm_channel_axis + channel_count = initializer_data.shape[channel_axis] + zero_points_list = [] + scales_list = [] + for i in range(channel_count): + per_channel_data = initializer_data.take(i, channel_axis) + channel_overrides = overrides[i] if overrides and i < len(overrides) else {} + channel_zero_point, channel_scale = compute_data_quant_params( + per_channel_data.ravel(), + quant_type, + is_symmetric, + reduce_range=reduce_range, + min_real_range=self.min_real_range, + rmin_override=channel_overrides.get("rmin"), + rmax_override=channel_overrides.get("rmax"), + ) + zero_points_list.append(channel_zero_point) + scales_list.append(channel_scale) + + zero_point = np.asarray(zero_points_list) + scale = np.asarray(scales_list) + + quantization_params[tensor_name] = QuantizationParams( + zero_point=zero_point, + scale=scale, + quant_type=quant_type, + axis=channel_axis, + ) + + return quantization_params diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index 9228ad33130f2..05daa33bd9d76 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -33,6 +33,12 @@ int4 = None uint4 = None +try: + from onnx.reference.op_run import to_array_extended +except ImportError: + # old version of onnx. + to_array_extended = None + __producer__ = "onnx.quantize" __version__ = "0.1.0" @@ -156,7 +162,9 @@ def from_string(format): } ONNX_INT_TYPE_SYMMETRIC_RANGE = { + onnx_proto.TensorProto.UINT8: (numpy.array(0, dtype=numpy.uint8), numpy.array(254, dtype=numpy.uint8)), onnx_proto.TensorProto.INT8: (numpy.array(-127, dtype=numpy.int8), numpy.array(127, dtype=numpy.int8)), + onnx_proto.TensorProto.UINT16: (numpy.array(0, dtype=numpy.uint16), numpy.array(65534, dtype=numpy.uint16)), onnx_proto.TensorProto.INT16: (numpy.array(-32767, dtype=numpy.int16), numpy.array(32767, dtype=numpy.int16)), } @@ -229,7 +237,7 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None): # which matches the python reference ONNX implementation of QuantizeLinear. # This data can be packed into 4-bit elements by using pack_bytes_to_4bit(). 
dtype = ONNX_TYPE_TO_NP_TYPE[qType] - (qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True) + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False) cliplow = max(qmin, low) if low is not None else qmin cliphigh = min(qmax, high) if high is not None else qmax @@ -269,7 +277,7 @@ def compute_scale_zp(rmin, rmax, qmin, qmax, symmetric=False, min_real_range=Non # Ensure a minimum float-point range if specified. if min_real_range is not None: - rmax = max(rmax, rmin + min_real_range) + rmax = max(rmax, rmin + numpy.asarray(min_real_range, dtype=rmin.dtype)) if symmetric: absmax = numpy.maximum(numpy.abs(rmin), numpy.abs(rmax)) @@ -338,13 +346,75 @@ def compute_scale_zp_float8(element_type, std): return [zero, scale] +def compute_data_quant_params( + data: numpy.ndarray, + quant_type: onnx.TensorProto.DataType, + symmetric: bool, + reduce_range: bool = False, + min_real_range: float | None = None, + rmin_override: float | None = None, + rmax_override: float | None = None, +) -> tuple[numpy.ndarray, numpy.ndarray]: + """ + Returns the zero_point and scale for the given data. + + :param data: The data for which to compute quantization parameters. + :param quant_type: The quantization data type. + :param symmetric: whether symmetric quantization is used or not. + :parameter reduce_range: True if the quantization range should be reduced. Defaults to False. + :parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None. + :parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data). + :parameter rmax_override: The value of rmax to use if not None. Otherwise, uses max(data). + :return: zero point and scale + """ + if not isinstance(data, numpy.ndarray): + raise TypeError(f"Weight must be given as an array not {type(data)}.") + if rmin_override is not None: + rmin = rmin_override + else: + rmin = data.min() if len(data) else 0.0 + + if rmax_override is not None: + rmax = rmax_override + else: + rmax = data.max() if len(data) else 0.0 + + rmin = numpy.array(rmin, dtype=data.dtype) + rmax = numpy.array(rmax, dtype=data.dtype) + scale = numpy.array(1.0, dtype=data.dtype) + + if quant_type == TensorProto.FLOAT8E4M3FN: + if reduce_range: + raise RuntimeError("Unsupported option reduce_range=True for float 8.") + std = numpy.std(data) + zero_point, scale = compute_scale_zp_float8(quant_type, std) + return _check_type(zero_point, scale, zero_point_index=0) + + if quant_type in ( + TensorProto.INT8, + TensorProto.UINT8, + TensorProto.INT16, + TensorProto.UINT16, + TensorProto.INT4, + TensorProto.UINT4, + ): + qmin, qmax = get_qmin_qmax_for_qType(quant_type, reduce_range, symmetric=symmetric) + if len(data): + zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range) + else: + zero_point = numpy.array(0, dtype=qmin.dtype) + return _check_type(zero_point, scale, zero_point_index=0) + + raise ValueError(f"Unexpected value for quant_type={quant_type}.") + + def quantize_data( data, qType, symmetric, reduce_range=False, min_real_range=None, rmin_override=None, rmax_override=None -): +) -> tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]: """ :param data: data to quantize - :param qType: data type to quantize to. Supported types UINT8 and INT8 - :param symmetric: whether symmetric quantization is used or not. This is applied to INT8. + :param qType: data type to quantize to. + :param symmetric: whether symmetric quantization is used or not. 
:parameter reduce_range: True if the quantization range should be reduced. Defaults to False. :parameter min_real_range: Minimum floating-point range (i.e., rmax - rmin) to enforce. Defaults to None. :parameter rmin_override: The value of rmin to use if not None. Otherwise, uses min(data). @@ -366,28 +436,16 @@ def quantize_data( - *S*: scale - *z*: zero point """ - if not isinstance(data, numpy.ndarray): - raise TypeError(f"Weight must be given as an array not {type(data)}.") - if rmin_override is not None: - rmin = rmin_override - else: - rmin = data.min() if len(data) else 0.0 - - if rmax_override is not None: - rmax = rmax_override - else: - rmax = data.max() if len(data) else 0.0 - - rmin = numpy.array(rmin, dtype=data.dtype) - rmax = numpy.array(rmax, dtype=data.dtype) - zero_point = 0 - scale = numpy.array(1.0, dtype=data.dtype) - + zero_point, scale = compute_data_quant_params( + data, + qType, + symmetric, + reduce_range, + min_real_range, + rmin_override, + rmax_override, + ) if qType == TensorProto.FLOAT8E4M3FN: - if reduce_range: - raise RuntimeError("Unsupported option reduce_range=True for float 8.") - std = numpy.std(data) - zero_point, scale = compute_scale_zp_float8(qType, std) quantized_data = quantize_nparray(qType, data, scale, zero_point) if any((quantized_data.astype(numpy.uint8).ravel() & 127) == 127): np_data = numpy.asarray(data) @@ -395,7 +453,7 @@ def quantize_data( f"One of the quantized value is NaN data in [{np_data.min()}, {np_data.max()}], " f"quantized_data in [{quantized_data.min()}, {quantized_data.max()}]." ) - return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2) + return zero_point, scale, quantized_data if qType in ( TensorProto.INT8, @@ -405,15 +463,91 @@ def quantize_data( TensorProto.INT4, TensorProto.UINT4, ): - if len(data): - qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, symmetric=symmetric) - zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric, min_real_range) quantized_data = quantize_nparray(qType, data, scale, zero_point) - return _check_type(rmin, rmax, zero_point, scale, quantized_data, zero_point_index=2) + return zero_point, scale, quantized_data raise ValueError(f"Unexpected value for qType={qType}.") +def quantize_onnx_initializer( + weight: onnx.TensorProto, + quant_type: onnx.TensorProto.DataType, + zero_point: numpy.ndarray, + scale: numpy.ndarray, + axis: int | None = None, + quant_weight_name: str | None = None, +) -> onnx.TensorProto: + """ + Returns a quantized version of the given ONNX initializer. + + :param weight: The ONNX initializer to quantize. + :param quant_type: The final quantized data type. + :param zero_point: The zero-point value to use for quantization. + :param scale: The scale value to use for quantization. + :param axis: The quantization axis if quantizing per-channel. Defaults to None. + :param quant_weight_name: The name of the quantized initializer. + If not specified, the quantized name is generated. + :return: The quantized ONNX initializer. 
+ """ + weight_data = tensor_proto_to_array(weight) + q_weight_data: numpy.ndarray | None = None + + if axis is None: # Per-tensor quantization + q_weight_data = quantize_nparray(quant_type, weight_data.ravel(), scale, zero_point) + else: # Per-channel quantization + channel_count = weight_data.shape[axis] + channel_dims = list(weight_data.shape) # deep copy + channel_dims[axis] = 1 # only one per channel for reshape + quantized_channel_data_list = [] + + for i in range(channel_count): + channel_data = weight_data.take(i, axis) + channel_scale = scale[i] + channel_zero_point = zero_point[i] + quantized_channel_data = quantize_nparray( + quant_type, channel_data.ravel(), channel_scale, channel_zero_point + ) + quantized_channel_data_list.append(numpy.asarray(quantized_channel_data).reshape(channel_dims)) + + q_weight_data = numpy.concatenate(quantized_channel_data_list, axis) + + q_weight_name = quant_weight_name if quant_weight_name else f"{weight.name}{TENSOR_NAME_QUANT_SUFFIX}" + + if quant_type == onnx.TensorProto.FLOAT8E4M3FN: + q_weight_initializer = onnx.TensorProto() + q_weight_initializer.data_type = quant_type + q_weight_initializer.dims.extend(weight.dims) + q_weight_initializer.name = q_weight_name + # Do not remove .flatten().copy() numpy is not clear about data persistence. + q_weight_initializer.raw_data = q_weight_data.flatten().copy().tobytes() + if to_array_extended is not None: + # This test should not be needed but it helped catch some issues + # with data persistence and tobytes. + check = to_array_extended(q_weight_initializer) + if check.shape != weight_data.shape or check.tobytes() != q_weight_data.tobytes(): + raise RuntimeError( + f"The initializer of shape {weight_data.shape} could not be created, expecting " + f"{q_weight_data.tobytes()[:10]}, got {check.tobytes()[:10]} and shape={weight.shape}" + f"\nraw={str(q_weight_initializer)[:200]}." + ) + elif quant_type in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): + if q_weight_data.dtype not in (numpy.int8, numpy.uint8): + raise RuntimeError(f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values.") + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. 
+ packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 + q_weight_initializer = onnx.helper.make_tensor(q_weight_name, quant_type, weight.dims, packed_data, raw=True) + else: + quant_np_dtype = onnx.helper.tensor_dtype_to_np_dtype(quant_type) + q_weight_data = numpy.asarray(q_weight_data, dtype=quant_np_dtype).reshape(weight.dims) + q_weight_initializer = onnx.numpy_helper.from_array(q_weight_data, q_weight_name) + + return q_weight_initializer + + def get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=False): # noqa: N802 """ Return qmin and qmax, the minimum and maximum value representable by the given qType diff --git a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py index 219d929d22fce..fbd0cc17f5d81 100644 --- a/onnxruntime/python/tools/quantization/tensor_quant_overrides.py +++ b/onnxruntime/python/tools/quantization/tensor_quant_overrides.py @@ -78,6 +78,10 @@ def has_per_channel_overrides(self, tensor_name: str) -> bool: overrides_list = self.overrides.get(tensor_name) return overrides_list and "axis" in overrides_list[0] + def overrides_scale_zp(self, tensor_name: str) -> bool: + overrides_list = self.overrides.get(tensor_name) + return overrides_list and ("scale" in overrides_list[0]) and ("zero_point" in overrides_list[0]) + def get_per_tensor_overrides( self, tensor_name: str, diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 96d841654adbd..b23d53f2a04e8 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -145,7 +145,7 @@ def test_quantize_data_4bit(self): for onnx_type, symmetric in subtest_configs: with self.subTest(onnx_type=onnx_type, symmetric=symmetric): - _, _, zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) + zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) is_signed = onnx_type == onnx.TensorProto.INT4 np_int_type = numpy.int8 if is_signed else numpy.uint8 qmin = numpy.array(-8 if is_signed else 0, dtype=np_int_type) diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index 21a772c5f56c7..41dae04f1c6ff 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -36,7 +36,7 @@ def setUp(self): self.bias = np.array([0.0, 1.0], dtype=np.float32) self.default_act_qtype = onnx.TensorProto.UINT8 self.default_wgt_qtype = onnx.TensorProto.UINT8 - self.default_wgt_qtype_per_channel = onnx.TensorProto.INT8 + self.default_wgt_qtype_per_channel = onnx.TensorProto.UINT8 self.default_bias_qtype = onnx.TensorProto.INT32 self.default_zp_scales = { @@ -49,7 +49,8 @@ def setUp(self): self.default_zp_scales_per_channel = { "INP": (0, np.float32(0.0235294122248888)), "SIG_OUT": (0, np.float32(0.003911871928721666)), - "WGT": ([0, 0], [np.float32(0.015748031437397003), np.float32(0.011811023578047752)]), + # per-channel weights are always symmetric (ie. 
zp = (qmin + qmax) / 2) + "WGT": ([127, 127], [np.float32(0.015748031437397003), np.float32(0.011811023578047752)]), "BIAS": ([0, 0], [np.float32(0.00006160428165458143), np.float32(0.00004620321124093607)]), "OUT": (0, np.float32(0.005075461231172085)), } @@ -420,12 +421,17 @@ def test_qdq_overrides_per_channel2(self): self.assertEqual(wgt_zp.data_type, quant_type.tensor_type) for index, (zp, scale) in enumerate(zip(wgt_zp.int32_data, wgt_sc.float_data)): - wgt_qmin, wgt_qmax = get_qmin_qmax_for_qType(wgt_zp.data_type, reduce_range=reduce_range) + wgt_qmin, wgt_qmax = get_qmin_qmax_for_qType( + wgt_zp.data_type, + symmetric=True, # per-channel is always symmetric + reduce_range=reduce_range, + ) expected_zp, expected_scale = compute_scale_zp( np.array(rmin_vals[index], dtype=np.float32), np.array(rmax_vals[index], dtype=np.float32), wgt_qmin, wgt_qmax, + symmetric=True, # per-channel is always symmetric ) self.assertEqual(zp, expected_zp) self.assertEqual(scale, np.float32(expected_scale))
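
Note on the weight-scale adjustment (illustration only, not part of the patches above): the standalone NumPy sketch below restates the core idea of patch 01 -- when input_scale * weight_scale is too small, the int32-quantized bias saturates, so the weight's scale is raised just enough for the largest |bias| value to fit. The helper adjust_weight_scale_for_int32_bias() and the example scales/bias values are made up for illustration; the actual implementation is the quantizer's _adjust_weight_scale_for_int32_bias(), which additionally handles per-channel scales, the symmetric zero-point check, and logging.

import numpy as np


def adjust_weight_scale_for_int32_bias(input_scale, weight_scale, bias_data):
    """If input_scale * weight_scale would clip bias_data when quantized to int32,
    return a proportionally larger weight scale; otherwise return it unchanged."""
    int32_info = np.iinfo(np.int32)
    qrange = np.float64(int32_info.max) - np.float64(int32_info.min + 1)
    absmax = np.float64(np.abs(bias_data).max()) if bias_data.size else np.float64(0.0)

    # Smallest bias scale that still maps the largest |bias| inside the int32 range
    # (the 1.0001 factor adds a small safety margin, mirroring the patch).
    smallest_valid_scale = 1.0001 * (2.0 * absmax) / qrange
    candidate_scale = np.float64(input_scale) * np.float64(weight_scale)

    if 0.0 < candidate_scale < smallest_valid_scale:
        # Scale up the weight's scale by the ratio needed to make the bias scale valid,
        # computing in float64 and storing back in the weight scale's original dtype.
        new_scale = np.float64(weight_scale) * (smallest_valid_scale / candidate_scale)
        return new_scale.astype(np.asarray(weight_scale).dtype)
    return weight_scale


# Hypothetical per-tensor example: tiny input/weight scales with a comparatively large bias.
input_scale = np.float32(1e-5)
weight_scale = np.float32(1e-4)
bias = np.array([3.5, -2.0], dtype=np.float32)

naive = np.round(bias.astype(np.float64) / (np.float64(input_scale) * np.float64(weight_scale)))
adjusted = adjust_weight_scale_for_int32_bias(input_scale, weight_scale, bias)
safe = np.round(bias.astype(np.float64) / (np.float64(input_scale) * np.float64(adjusted)))

print(naive)  # approx. [3.5e9, -2.0e9]; 3.5e9 exceeds np.iinfo(np.int32).max and would be clipped
print(safe)   # both values now fall inside the int32 range

Raising the weight's scale slightly coarsens the weight quantization, but the matching DequantizeLinear scale compensates for it, which is generally a better trade-off than silently clipping the bias to the int32 bounds.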