
Commit b3a8cdd

add int8 quantization support (#3058)
1 parent d84cd18 commit b3a8cdd

File tree

6 files changed: +107 -22 lines changed

examples/dynamo/vgg16_fp8_ptq.py renamed to examples/dynamo/vgg16_ptq.py (+21 -7)

@@ -1,10 +1,10 @@
 """
-.. _vgg16_fp8_ptq:
+.. _vgg16_ptq:

 Deploy Quantized Models using Torch-TensorRT
 ======================================================

-Here we demonstrate how to deploy a model quantized to FP8 using the Dynamo frontend of Torch-TensorRT
+Here we demonstrate how to deploy a model quantized to INT8 or FP8 using the Dynamo frontend of Torch-TensorRT
 """

 # %%
@@ -111,7 +111,12 @@ def vgg16(num_classes=1000, init_weights=False):
     type=int,
     help="Batch size for tuning the model with PTQ and FP8",
 )
-
+PARSER.add_argument(
+    "--quantize-type",
+    default="int8",
+    type=str,
+    help="quantization type, currently supported int8 or fp8 for PTQ",
+)
 args = PARSER.parse_args()

 model = vgg16(num_classes=10, init_weights=False)
@@ -191,8 +196,10 @@ def calibrate_loop(model):
 # %%
 # Tune the pre-trained model with FP8 and PTQ
 # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-quant_cfg = mtq.FP8_DEFAULT_CFG
+if args.quantize_type == "int8":
+    quant_cfg = mtq.INT8_DEFAULT_CFG
+elif args.quantize_type == "fp8":
+    quant_cfg = mtq.FP8_DEFAULT_CFG
 # PTQ with in-place replacement to quantized modules
 mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
 # model has FP8 qdq nodes at this point
@@ -226,11 +233,18 @@ def calibrate_loop(model):
     with export_torch_mode():
         # Compile the model with Torch-TensorRT Dynamo backend
         input_tensor = images.cuda()
-        exp_program = torch.export.export(model, (input_tensor,))
+        # torch.export.export() failed due to RuntimeError: Attempting to use FunctionalTensor on its own. Instead, please use it with a corresponding FunctionalTensorMode()
+        from torch.export._trace import _export
+
+        exp_program = _export(model, (input_tensor,))
+        if args.quantize_type == "int8":
+            enabled_precisions = {torch.int8}
+        elif args.quantize_type == "fp8":
+            enabled_precisions = {torch.float8_e4m3fn}
         trt_model = torchtrt.dynamo.compile(
             exp_program,
             inputs=[input_tensor],
-            enabled_precisions={torch.float8_e4m3fn},
+            enabled_precisions=enabled_precisions,
             min_block_size=1,
             debug=False,
         )
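
A minimal sketch of the INT8 path the renamed example now takes, assuming nvidia-modelopt and a CUDA device are available; model, calibrate_loop, and images stand in for the example's VGG16 model, its calibration loop, and a batch from its dataloader:

import modelopt.torch.quantization as mtq
import torch
import torch_tensorrt as torchtrt
from modelopt.torch.quantization.utils import export_torch_mode
from torch.export._trace import _export  # workaround noted in the diff above

quant_cfg = mtq.INT8_DEFAULT_CFG  # mtq.FP8_DEFAULT_CFG when --quantize-type fp8
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)  # in-place insertion of Q/DQ nodes

with torch.no_grad():
    with export_torch_mode():
        input_tensor = images.cuda()
        exp_program = _export(model, (input_tensor,))
        trt_model = torchtrt.dynamo.compile(
            exp_program,
            inputs=[input_tensor],
            enabled_precisions={torch.int8},  # {torch.float8_e4m3fn} for FP8
            min_block_size=1,
        )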

py/torch_tensorrt/_enums.py (+1 -2)

@@ -5,11 +5,10 @@
 from typing import Any, Optional, Type, Union

 import numpy as np
+import tensorrt as trt
 import torch
 from torch_tensorrt._features import ENABLED_FEATURES, needs_torch_tensorrt_runtime

-import tensorrt as trt
-

 class dtype(Enum):
     """Enum to describe data types to Torch-TensorRT, has compatibility with torch, tensorrt and numpy dtypes"""

py/torch_tensorrt/dynamo/conversion/aten_ops_converters.py (+6 -4)

@@ -606,28 +606,30 @@ def aten_ops_neg(
 try:
     import modelopt.torch.quantization as mtq  # noqa: F401

-    assert torch.ops.trt.quantize_fp8.default
+    assert torch.ops.tensorrt.quantize_op.default
 except Exception as e:
     _LOGGER.warning(
         "Unable to import quantization op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models"
     )
 else:

-    @dynamo_tensorrt_converter(torch.ops.trt.quantize_fp8.default)
-    def aten_ops_quantize_fp8(
+    @dynamo_tensorrt_converter(torch.ops.tensorrt.quantize_op.default)
+    def aten_ops_quantize_op(
         ctx: ConversionContext,
         target: Target,
         args: Tuple[Argument, ...],
         kwargs: Dict[str, Argument],
         name: str,
     ) -> Union[TRTTensor, Sequence[TRTTensor]]:
-        return impl.quantize.quantize_fp8(
+        return impl.quantize.quantize(
             ctx,
             target,
             SourceIR.ATEN,
             name,
             args[0],
             args[1],
+            args[2],
+            args[3],
         )
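
The converter now forwards args[2] and args[3] because the exported quantize_op presumably carries (input, amax, num_bits, exponent_bits) as positional arguments, matching the updated impl.quantize.quantize signature below. A minimal sanity check, assuming nvidia-modelopt is installed:

import torch
import modelopt.torch.quantization  # noqa: F401  # importing modelopt registers the custom quantize op

# If the op resolves, the converter registered above can handle quantized graphs.
assert torch.ops.tensorrt.quantize_op.default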

py/torch_tensorrt/dynamo/conversion/impl/quantize.py (+27 -9)

@@ -10,36 +10,54 @@
 from torch_tensorrt.fx.types import TRTTensor


-def quantize_fp8(
+def quantize(
     ctx: ConversionContext,
     target: Target,
     source_ir: Optional[SourceIR],
     name: str,
     input_tensor: TRTTensor,
-    scale: np.ndarray,
+    amax: np.ndarray,
+    num_bits: int,
+    exponent_bits: int,
 ) -> TRTTensor:
     """
     Adds quantize and dequantize ops (QDQ) which quantize to INT8 or FP8 based
     on the output_type set and dequantizes them back.
     """
-    if (isinstance(input_tensor, TRTTensor)) and not (
-        input_tensor.dtype == trt.float32 or input_tensor.dtype == trt.float16
+    if isinstance(input_tensor, TRTTensor) and input_tensor.dtype not in (
+        trt.float32,
+        trt.float16,
     ):
         raise ValueError(
-            f"quantize_fp8 converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16"
+            f"quantize converter received an input of {input_tensor.dtype} type. Supported types: float32 | float16"
         )
-
+    if num_bits != 8 or exponent_bits not in (0, 4):
+        raise ValueError(
+            f"quantize converter currently only accept INT8 or FP8 based quantize, got {num_bits=}, {exponent_bits=}"
+        )
+    if num_bits == 8 and exponent_bits == 0:
+        max_bound = 127
+    elif num_bits == 8 and exponent_bits == 4:
+        max_bound = 448
+    scale = np.divide(amax, max_bound)
     scale = get_trt_tensor(ctx, scale, name + "_scale")
     # Add Q node
     quantize_layer = ctx.net.add_quantize(input_tensor, scale)
-    quantize_layer.set_output_type(0, trt.DataType.FP8)
+    if num_bits == 8 and exponent_bits == 0:
+        quantize_layer.set_output_type(0, trt.DataType.INT8)
+    elif num_bits == 8 and exponent_bits == 4:
+        quantize_layer.set_output_type(0, trt.DataType.FP8)
+
     set_layer_name(quantize_layer, target, name + "_quantize", source_ir)
     q_output = quantize_layer.get_output(0)
     # Add DQ node
     dequantize_layer = ctx.net.add_dequantize(q_output, scale)
     set_layer_name(dequantize_layer, target, name + "_dequantize", source_ir)
-    # Set DQ layer precision to FP8
-    dequantize_layer.precision = trt.DataType.FP8
+    if num_bits == 8 and exponent_bits == 0:
+        dequantize_layer.precision = trt.DataType.INT8
+    elif num_bits == 8 and exponent_bits == 4:
+        # Set DQ layer precision to FP8
+        dequantize_layer.precision = trt.DataType.FP8
     dq_output = dequantize_layer.get_output(0)

     return dq_output
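
The converter's contract changes from a precomputed scale to the calibration amax plus (num_bits, exponent_bits): the scale is amax divided by the format's maximum representable magnitude, 127 for INT8 (exponent_bits=0) and 448 for FP8 E4M3 (exponent_bits=4). A small worked example of that arithmetic, with an illustrative amax value:

import numpy as np

amax = np.float32(6.35)    # per-tensor calibration maximum (illustrative)
int8_scale = amax / 127.0  # num_bits=8, exponent_bits=0 -> max_bound 127 -> 0.05
fp8_scale = amax / 448.0   # num_bits=8, exponent_bits=4 -> max_bound 448 -> ~0.0142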

tests/py/dynamo/models/test_models_export.py (+50 -0)

@@ -1,6 +1,7 @@
 # type: ignore
 import unittest

+import modelopt
 import pytest
 import timm
 import torch
@@ -225,3 +226,52 @@ def calibrate_loop(model):
     )
     outputs_trt = trt_model(input_tensor)
     assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2)
+
+
+@unittest.skipIf(
+    modelopt.__version__ < "0.16.1",
+    "Int8 quantization is supported in modelopt since 0.16.1 or later",
+)
+@pytest.mark.unit
+def test_base_int8(ir):
+    class SimpleNetwork(torch.nn.Module):
+        def __init__(self):
+            super(SimpleNetwork, self).__init__()
+            self.linear1 = torch.nn.Linear(in_features=10, out_features=5)
+            self.linear2 = torch.nn.Linear(in_features=5, out_features=1)
+
+        def forward(self, x):
+            x = self.linear1(x)
+            x = torch.nn.ReLU()(x)
+            x = self.linear2(x)
+            return x
+
+    import modelopt.torch.quantization as mtq
+    from modelopt.torch.quantization.utils import export_torch_mode
+
+    def calibrate_loop(model):
+        """Simple calibration function for testing."""
+        model(input_tensor)
+
+    input_tensor = torch.randn(1, 10).cuda()
+    model = SimpleNetwork().eval().cuda()
+
+    quant_cfg = mtq.INT8_DEFAULT_CFG
+    mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    # model has INT8 qdq nodes at this point
+    output_pyt = model(input_tensor)
+
+    with torch.no_grad():
+        with export_torch_mode():
+            from torch.export._trace import _export
+
+            exp_program = _export(model, (input_tensor,))
+            trt_model = torchtrt.dynamo.compile(
+                exp_program,
+                inputs=[input_tensor],
+                enabled_precisions={torch.int8},
+                min_block_size=1,
+                debug=True,
+            )
+            outputs_trt = trt_model(input_tensor)
+            assert torch.allclose(output_pyt, outputs_trt, rtol=1e-3, atol=1e-2)
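
Assuming a CUDA device, TensorRT, and nvidia-modelopt 0.16.1 or later are available, the new test can be run in isolation with: pytest tests/py/dynamo/models/test_models_export.py -k test_base_int8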

tests/py/requirements.txt (+2 -0)

@@ -9,4 +9,6 @@ pytest-xdist>=3.6.1
 pyyaml
 timm>=1.0.3
 transformers==4.40.2
+# TODO: once 0.16.1 is out, update it here
+nvidia-modelopt>=0.15.1
 --extra-index-url https://pypi.nvidia.com
