feat: Implement FP32 accumulation for matmul #3110

Merged
merged 81 commits on Oct 11, 2024
Changes from 75 commits
Commits (81)
2ea181a
chore: add gpt2 example
peri044 Jun 13, 2024
37b65a5
chore: add llama2 example
peri044 Jun 13, 2024
bd12b12
Merge branch 'main' into llm_examples_main
peri044 Jun 13, 2024
4a9f73e
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
0387d0b
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
6193939
chore: updates
peri044 Jun 14, 2024
9d3296e
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
84fc49c
Merge branch 'main' into llm_examples_main
peri044 Jun 18, 2024
ff17d91
chore: rebase
peri044 Jun 18, 2024
8e6ba26
Merge branch 'llm_examples_main' of github.com:pytorch/TensorRT into …
peri044 Jun 24, 2024
67ec408
Merge branch 'main' into llm_examples_main
peri044 Jun 25, 2024
9af8e39
chore: remove aten.full decomposition
peri044 Jun 25, 2024
50d4096
chore: fix expand DS support
peri044 Jun 25, 2024
59febf5
chore: minor fix
peri044 Jun 26, 2024
c3e4382
chore: updates
peri044 Jun 26, 2024
0673db4
chore: add testcase
peri044 Jun 26, 2024
0b62f8f
Merge branch 'main' into full
peri044 Jun 26, 2024
54f6410
Merge branch 'full' into fix_expand_ds
peri044 Jun 26, 2024
ae3d6b2
Merge branch 'fix_expand_ds' into llm_examples_main
peri044 Jun 26, 2024
4464fd5
chore: updates
peri044 Jun 26, 2024
63b13cf
chore: updates
peri044 Jun 28, 2024
3d10b92
Merge branch 'main' into llm_examples_main
peri044 Jun 28, 2024
e97a94f
chore: updates
peri044 Jul 10, 2024
4f503a8
chore: updates
peri044 Jul 11, 2024
5ecf63e
chore: rebase
peri044 Jul 11, 2024
0d00d8c
chore: updates
peri044 Jul 11, 2024
8099003
chore: updates
peri044 Jul 11, 2024
457f706
chore: updates
peri044 Jul 11, 2024
ce3b2f8
chore: updates
peri044 Jul 11, 2024
d8acadc
chore: updates
peri044 Jul 12, 2024
262c87d
chore: updates
peri044 Jul 12, 2024
bb94dfd
chore: rebase
peri044 Jul 17, 2024
736b839
chore: updates
peri044 Jul 17, 2024
313380e
chore: bug fixes
peri044 Jul 18, 2024
1057d83
chore: updates
peri044 Jul 19, 2024
bfd0cf2
chore: fixes
peri044 Jul 20, 2024
17ddb31
chore: updates
peri044 Jul 20, 2024
88be4fa
chore: add torch compile gpt2 example
peri044 Jul 22, 2024
df825ab
chore: updates
peri044 Jul 22, 2024
ff07295
chore: add timing calculation
peri044 Jul 24, 2024
857b0aa
Merge branch 'main' into llm_examples_main
peri044 Jul 24, 2024
8fae56b
Merge branch 'main' into llm_examples_main
peri044 Jul 29, 2024
d483718
chore: rebase
peri044 Jul 31, 2024
397e4bc
Merge branch 'main' into llm_examples_main
peri044 Aug 5, 2024
6c9b9fe
chore: updates
peri044 Aug 5, 2024
6313b1c
chore: updates
peri044 Aug 9, 2024
d608cc5
chore: rebase
peri044 Aug 9, 2024
1327782
chore: rebase fixes
peri044 Aug 9, 2024
0980778
chore: updates
peri044 Aug 9, 2024
94b2ba1
chore: updates
peri044 Aug 9, 2024
2b1db29
chore: updates
peri044 Aug 9, 2024
9f606fc
chore: updates
peri044 Aug 9, 2024
0cf23be
Merge branch 'main' into llm_examples_main
peri044 Aug 14, 2024
3228c57
chore: Update perf tooling with support for HF models (#3034)
peri044 Aug 15, 2024
6786f0e
chore: updates
Aug 15, 2024
e4873d0
chore: updates
peri044 Aug 19, 2024
a725ce0
Merge branch 'main' into llm_examples_main
peri044 Aug 19, 2024
bb10de4
feat: lowering replace aten.full_like with aten.full
chohk88 Aug 12, 2024
1527aa0
chore: minor linting
chohk88 Aug 12, 2024
67e33c3
chore: updates
peri044 Aug 19, 2024
5627c1a
Merge branch 'llm_examples_main' of github.com:pytorch/TensorRT into …
peri044 Aug 19, 2024
7be8604
chore: updates
peri044 Aug 21, 2024
4d75a2e
Merge branch 'main' into llm_examples_main
peri044 Aug 21, 2024
0ab0dbf
feat: add fp32 accumulation option for matmul layer
peri044 Aug 21, 2024
3c815f8
chore: updates
Aug 28, 2024
5617c0a
chore: Bump TRT version to 10.3.0.26 (#3071)
zewenli98 Aug 24, 2024
213526e
chore: updates
peri044 Aug 30, 2024
c193593
chore : updates
peri044 Aug 30, 2024
0de0b16
chore: updates
peri044 Sep 24, 2024
a90191d
chore: rebase with main
peri044 Sep 24, 2024
71e33cb
chore: updates
peri044 Sep 26, 2024
4257b1e
chore: updates
peri044 Sep 30, 2024
619a39a
chore: updates
peri044 Oct 1, 2024
8c0b9c6
chore: trunc_fiv fix
peri044 Oct 7, 2024
b6261f9
chore: update result
peri044 Oct 7, 2024
ebdfe8f
fix: add model.half() for llama2
peri044 Oct 7, 2024
61ec948
chore: address review comments
peri044 Oct 8, 2024
dd27a54
chore: address review comments
peri044 Oct 8, 2024
b2e5244
chore: add docs
peri044 Oct 8, 2024
7ddd637
chore: updates
peri044 Oct 8, 2024
4529717
chore: sign bug fix
peri044 Oct 10, 2024
2 changes: 2 additions & 0 deletions docsrc/index.rst
@@ -118,6 +118,8 @@ Tutorials
tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion
tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
tutorials/_rendered_examples/dynamo/torch_export_gpt2
tutorials/_rendered_examples/dynamo/torch_export_llama2

Python API Documentation
------------------------
31 changes: 24 additions & 7 deletions examples/dynamo/README.rst
@@ -1,19 +1,36 @@
.. _torch_compile:

Dynamo / ``torch.compile``
----------------------------
Torch-TensorRT Examples
====================================

Torch-TensorRT provides a backend for the new ``torch.compile`` API released in PyTorch 2.0. In the following examples we describe
a number of ways you can leverage this backend to accelerate inference.
Please refer to the following examples which demonstrate the usage of different features of Torch-TensorRT. We also provide
examples of Torch-TensorRT compilation of select computer vision and language models.

* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile``
* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile``
Dependencies
------------------------------------

Please install the following external dependencies (assuming you already have the correct ``torch``, ``torch_tensorrt`` and ``tensorrt`` libraries installed; see `dependencies <https://github.com/pytorch/TensorRT?tab=readme-ov-file#dependencies>`_).

.. code-block:: sh

pip install -r requirements.txt


Compiler Features
------------------------------------
* :ref:`torch_compile_advanced_usage`: Advanced usage including making a custom backend to use directly with the ``torch.compile`` API
* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile``
* :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"`
* :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines
* :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights
* :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile``
* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times
* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT

Model Zoo
------------------------------------
* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile``
* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile``
* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile``
* :ref:`torch_export_gpt2`: Compiling a GPT2 model using the AOT workflow (`ir="dynamo"`)
* :ref:`torch_export_llama2`: Compiling a Llama2 model using the AOT workflow (`ir="dynamo"`)
4 changes: 2 additions & 2 deletions examples/dynamo/requirements.txt
@@ -1,4 +1,4 @@
cupy==13.1.0
torch>=2.4.0.dev20240503+cu121
torch-tensorrt>=2.4.0.dev20240503+cu121
triton==2.3.0
diffusers==0.30.3
transformers==4.44.2
26 changes: 18 additions & 8 deletions examples/dynamo/torch_export_gpt2.py
@@ -25,12 +25,16 @@
# CPU is used here so that GPU memory is reserved for TRT compilation.
with torch.no_grad():
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained(
"gpt2",
pad_token_id=tokenizer.eos_token_id,
use_cache=False,
attn_implementation="eager",
).eval()
model = (
AutoModelForCausalLM.from_pretrained(
"gpt2",
pad_token_id=tokenizer.eos_token_id,
use_cache=False,
attn_implementation="eager",
)
.eval()
.half()
)

# %%
# Tokenize a sample input prompt and get pytorch model outputs
@@ -56,6 +60,8 @@
truncate_double=True,
device=DEVICE,
disable_tf32=True,
use_strong_types=True,
use_fp32_acc=True,
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
@@ -81,6 +87,10 @@
# %%
# The output sentences should look like
# =============================
# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# Pytorch model generated text: What is parallel programming ?

# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. The main difference between parallel programming and parallel programming is that
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# TensorRT model generated text: What is parallel programming ?

# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. The main difference between parallel programming and parallel programming is that
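The GPT-2 and Llama2 examples above both rely on an auto-regressive greedy-decoding loop that is elided from this diff. Below is a minimal sketch of what such a loop looks like, assuming a hypothetical helper name (greedy_generate) and Hugging Face-style model outputs; the helper the examples actually use may differ in details.

import torch

@torch.no_grad()
def greedy_generate(model, input_ids, max_new_tokens=64, eos_token_id=None):
    # Greedy decoding: append the arg-max token one step at a time.
    seq = input_ids
    for _ in range(max_new_tokens):
        outputs = model(seq)
        # Hugging Face models return a ModelOutput; a compiled module may return raw logits.
        logits = outputs.logits if hasattr(outputs, "logits") else outputs
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        seq = torch.cat([seq, next_token], dim=-1)
        if eos_token_id is not None and (next_token == eos_token_id).all():
            break
    return seq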
7 changes: 4 additions & 3 deletions examples/dynamo/torch_export_llama2.py
@@ -50,10 +50,11 @@
llama2_ep,
inputs=[input_ids],
enabled_precisions={torch.float32},
min_block_size=1,
truncate_double=True,
device=DEVICE,
disable_tf32=True,
use_strong_types=True,
use_fp32_acc=True,
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
@@ -85,6 +86,6 @@
# %%
# The output sentences should look like
# =============================
# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# Pytorch model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# TensorRT model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and
28 changes: 27 additions & 1 deletion py/torch_tensorrt/dynamo/_compiler.py
@@ -88,6 +88,8 @@ def compile(
engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR,
engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE,
custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE,
use_strong_types: bool = _defaults.USE_STRONG_TYPES,
use_fp32_acc: bool = _defaults.USE_FP32_ACC,
**kwargs: Any,
) -> torch.fx.GraphModule:
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -158,6 +160,8 @@ def compile(
engine_cache_dir (Optional[str]): Directory to store the cached TRT engines
engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default
custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored.
use_strong_types (bool): Enable strong typing in TensorRT compilation
use_fp32_acc (bool): Inserts casts to FP32 around matmul layers so that TensorRT accumulates the matmul in FP32. Use this only when FP16 precision is configured in enabled_precisions.
**kwargs: Any,
Returns:
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -197,6 +201,20 @@ def compile(
"\nThis feature is unimplemented in Torch-TRT Dynamo currently."
)

if use_strong_types:
if len(enabled_precisions) != 1 or not any(
x in enabled_precisions for x in {torch.float32, dtype.f32}
):
raise AssertionError(
f"When use_strong_types is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}"
)

if use_fp32_acc:
logger.debug(
"FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights. \
This flag inserts casts around matmul layers and ensures TensorRT executes the matmul layers in FP16 with FP32 accumulation."
)

# Aliasing inputs to arg_inputs for better understanding
if not arg_inputs and not inputs:
raise AssertionError("'arg_inputs' and 'inputs' should not both be None.")
@@ -232,7 +250,7 @@ def compile(
logger.debug("Input graph: " + str(gm.graph))

# Apply lowering on the graph module
gm = post_lowering(gm)
gm = post_lowering(gm, use_fp32_acc=use_fp32_acc)
logger.debug("Lowered Input graph: " + str(gm.graph))

engine_cache = None
@@ -281,6 +299,8 @@ def compile(
"lazy_engine_init": lazy_engine_init,
"cache_built_engines": cache_built_engines,
"reuse_cached_engines": reuse_cached_engines,
"use_strong_types": use_strong_types,
"use_fp32_acc": use_fp32_acc,
}

settings = CompilationSettings(**compilation_options)
@@ -520,6 +540,8 @@ def convert_exported_program_to_serialized_trt_engine(
calibrator: object = None,
allow_shape_tensors: bool = False,
timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
use_strong_types: bool = _defaults.USE_STRONG_TYPES,
use_fp32_acc: bool = _defaults.USE_FP32_ACC,
**kwargs: Any,
) -> bytes:
"""Convert an ExportedProgram to a serialized TensorRT engine
Expand Down Expand Up @@ -578,6 +600,8 @@ def convert_exported_program_to_serialized_trt_engine(
calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration
allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
use_strong_types (bool): Enable strong typing in TensorRT compilation
use_fp32_acc (bool): Inserts casts to FP32 around matmul layers so that TensorRT accumulates the matmul in FP32. Use this only when FP16 precision is configured in enabled_precisions.
Returns:
bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
"""
Expand Down Expand Up @@ -651,6 +675,8 @@ def convert_exported_program_to_serialized_trt_engine(
"dla_local_dram_size": dla_local_dram_size,
"dla_global_dram_size": dla_global_dram_size,
"timing_cache_path": timing_cache_path,
"use_strong_types": use_strong_types,
"use_fp32_acc": use_fp32_acc,
}

exported_program = pre_export_lowering(exported_program)
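As a reading aid for the use_fp32_acc docstring above: the flag asks post_lowering to wrap matmul operations with casts. The snippet below is only a conceptual, eager-mode sketch of the resulting pattern (the function name matmul_fp32_acc is hypothetical and not part of the PR); the real pass rewrites aten matmul nodes in the FX graph.

import torch

def matmul_fp32_acc(a_fp16: torch.Tensor, b_fp16: torch.Tensor) -> torch.Tensor:
    # Cast the FP16 operands up to FP32 so the matmul accumulates in FP32 ...
    out = torch.matmul(a_fp16.to(torch.float32), b_fp16.to(torch.float32))
    # ... then cast the result back to FP16 so downstream layers keep running in FP16.
    return out.to(torch.float16)

As the logger message above describes, TensorRT can then execute the matmul layers in FP16 while accumulating in FP32.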
2 changes: 2 additions & 0 deletions py/torch_tensorrt/dynamo/_defaults.py
@@ -40,6 +40,8 @@
ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
ENGINE_CACHE_SIZE = 1073741824
CUSTOM_ENGINE_CACHE = None
USE_STRONG_TYPES = False
USE_FP32_ACC = False


def default_device() -> Device:
6 changes: 6 additions & 0 deletions py/torch_tensorrt/dynamo/_settings.py
@@ -30,7 +30,9 @@
TIMING_CACHE_PATH,
TRUNCATE_DOUBLE,
USE_FAST_PARTITIONER,
USE_FP32_ACC,
USE_PYTHON_RUNTIME,
USE_STRONG_TYPES,
VERSION_COMPATIBLE,
WORKSPACE_SIZE,
default_device,
@@ -78,6 +80,8 @@ class CompilationSettings:
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
cache_built_engines (bool): Whether to save the compiled TRT engines to storage
reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
use_strong_types (bool): Enable strong typing in TensorRT compilation
use_fp32_acc (bool): Inserts casts to FP32 around matmul layers so that TensorRT accumulates the matmul in FP32. Use this only when FP16 precision is configured in enabled_precisions.
"""

enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -112,6 +116,8 @@ class CompilationSettings:
lazy_engine_init: bool = LAZY_ENGINE_INIT
cache_built_engines: bool = CACHE_BUILT_ENGINES
reuse_cached_engines: bool = REUSE_CACHED_ENGINES
use_strong_types: bool = USE_STRONG_TYPES
use_fp32_acc: bool = USE_FP32_ACC


_SETTINGS_TO_BE_ENGINE_INVARIANT = (
2 changes: 1 addition & 1 deletion py/torch_tensorrt/dynamo/backend/backends.py
@@ -100,7 +100,7 @@ def _pretraced_backend(

logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))

gm = post_lowering(gm)
gm = post_lowering(gm, use_fp32_acc=settings.use_fp32_acc)

logger.debug("Lowered Input graph:\n " + str(gm.graph))

9 changes: 5 additions & 4 deletions py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -80,10 +80,11 @@ def __init__(
self.builder = trt.Builder(self.logger)

flag = 0

# It is deprecated to not use this flag
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
flag |= EXPLICIT_BATCH
if compilation_settings.use_strong_types:
STRONGLY_TYPED = 1 << (int)(
trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED
)
flag |= STRONGLY_TYPED

self.ctx = ConversionContext(
self.builder.create_network(flag), compilation_settings
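For context on the interpreter change above, here is a minimal standalone sketch of how TensorRT network-definition creation flags compose as a bitmask (TensorRT 10 Python API); the variable names are illustrative and not taken from the PR.

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)

flag = 0
use_strong_types = True  # mirrors compilation_settings.use_strong_types
if use_strong_types:
    # In a strongly typed network, layer precisions follow the tensor dtypes in the
    # graph rather than builder precision flags such as FP16/INT8.
    flag |= 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED)

network = builder.create_network(flag)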
3 changes: 1 addition & 2 deletions py/torch_tensorrt/dynamo/conversion/_conversion.py
@@ -3,6 +3,7 @@
import logging
from typing import Any, List, Optional, Sequence

import tensorrt as trt
import torch
from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
from torch_tensorrt._Device import Device
@@ -18,8 +19,6 @@
from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
from torch_tensorrt.dynamo.utils import get_model_device, get_torch_inputs

import tensorrt as trt

logger = logging.getLogger(__name__)


6 changes: 2 additions & 4 deletions py/torch_tensorrt/dynamo/conversion/impl/elementwise/base.py
@@ -13,13 +13,11 @@
broadcast_to_same_shape,
cast_trt_tensor,
get_trt_tensor,
)
from torch_tensorrt.fx.converters.converter_utils import (
broadcast,
has_dynamic_shape,
set_layer_name,
)
from torch_tensorrt.fx.types import TRTElementWiseOp, TRTTensor
from torch_tensorrt.dynamo.types import TRTElementWiseOp, TRTTensor


def get_python_op_from_trt_elementwise_op(
@@ -152,7 +150,7 @@ def convert_binary_elementwise(

if has_dynamic_shape(lhs_val.shape) or has_dynamic_shape(rhs_val.shape):
lhs_val, rhs_val = broadcast(
ctx.net, lhs_val, rhs_val, f"{name}_broadcast_lhs", f"{name}_broadcast_rhs"
ctx, lhs_val, rhs_val, f"{name}_broadcast_lhs", f"{name}_broadcast_rhs"
)
else:
lhs_val, rhs_val = broadcast_to_same_shape(
9 changes: 4 additions & 5 deletions py/torch_tensorrt/dynamo/conversion/impl/elementwise/ops.py
@@ -18,14 +18,11 @@
from torch_tensorrt.dynamo.conversion.impl.elementwise.base import (
convert_binary_elementwise,
)
from torch_tensorrt.dynamo.conversion.impl.shape import get_shape_with_dynamic_shape
from torch_tensorrt.dynamo.conversion.impl.unary import atan, sign
from torch_tensorrt.dynamo.conversion.impl.unary.base import convert_unary
from torch_tensorrt.fx.converters.converter_utils import broadcast
from torch_tensorrt.fx.types import TRTTensor

import tensorrt as trt


def trunc_div(
ctx: ConversionContext,
@@ -69,10 +66,12 @@ def trunc_div(
prod_output,
)

# TODO: This casting causes output divergence for llama2 in FP16.
# @apbose to investigate why this is needed and suggest alternatives.
# cast the sign_output back to int32 for trunc div
# This is required for scatter_reduce_.two(reduce='mean' where trunc_div casts it to float32 and TRTInterpreter expects int32)
if (isinstance(sign_output, TRTTensor)) and (sign_output.dtype == trt.float32):
sign_output = cast_trt_tensor(ctx, sign_output, trt.int32, name)
# if (isinstance(sign_output, TRTTensor)) and (sign_output.dtype == trt.float32):
# sign_output = cast_trt_tensor(ctx, sign_output, trt.int32, name)

# Convert constant input into ITensor for UnaryOperation
if not isinstance(input, trt.tensorrt.ITensor):
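To clarify what the trunc_div converter above has to reproduce, here is the reference behaviour in eager PyTorch, a sketch for illustration only rather than the TensorRT implementation:

import torch

def trunc_div_reference(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Truncated division keeps the integer part of x / y, rounding toward zero,
    # i.e. sign(x / y) * floor(|x| / |y|); this is why the converter builds
    # sign and prod_output intermediates.
    return torch.div(x, y, rounding_mode="trunc")

x = torch.tensor([7.0, -7.0, 7.0, -7.0])
y = torch.tensor([2.0, 2.0, -2.0, -2.0])
print(trunc_div_reference(x, y))  # tensor([ 3., -3., -3.,  3.])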
13 changes: 9 additions & 4 deletions py/torch_tensorrt/dynamo/lowering/_decompositions.py
@@ -296,16 +296,21 @@ class ReduceOperation(Enum):
AMAX = ("Amax reduce operation", lambda x, y: torch.max(x, y))
AMIN = ("Amin reduce operation", lambda x, y: torch.min(x, y))

def __new__(cls, description, func):
def __new__(cls, description: Any, func: Any) -> Any:
obj = object.__new__(cls)
obj._value_ = auto()
obj.description = description
obj.func = func
return obj

def reduce_operation_with_scatter(
self, operation_lhs, initial_tensor, dim, index_tensor, src_tensor
):
self,
operation_lhs: Any,
initial_tensor: torch.Tensor,
dim: int,
index_tensor: torch.Tensor,
src_tensor: torch.Tensor,
) -> Any:
scatter_tensor = None
if self == ReduceOperation.SUM or self == ReduceOperation.MEAN:
scatter_tensor = torch.zeros_like(initial_tensor)
@@ -341,7 +346,7 @@ def scatter_reduce_decomposition(
scatter_count_tensor = torch.zeros_like(input_tensor)
src_shape = list(src_tensor.shape)
src_dim = src_shape[dim]
if include_self == False:
if not include_self:
raise AssertionError("include_self False for scatter reduce not yet supported")
for i in range(0, src_dim):
src_slice = torch.select(src_tensor, dim, i)
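For reference alongside the decomposition above, a small eager-mode example of the operation being decomposed (scatter_reduce with reduce="mean" and include_self=True, the case this lowering supports); the tensors are illustrative only.

import torch

input_tensor = torch.zeros(3)
index = torch.tensor([0, 0, 2])
src = torch.tensor([1.0, 3.0, 5.0])

# Each destination slot averages its own value with every src element routed to it:
# slot 0 <- mean(0, 1, 3), slot 1 is untouched, slot 2 <- mean(0, 5)
out = input_tensor.scatter_reduce(0, index, src, reduce="mean", include_self=True)
print(out)  # tensor([1.3333, 0.0000, 2.5000])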