feat: Implement FP32 accumulation for matmul #3110

Merged: 81 commits, merged Oct 11, 2024
Commits (81)
2ea181a
chore: add gpt2 example
peri044 Jun 13, 2024
37b65a5
chore: add llama2 example
peri044 Jun 13, 2024
bd12b12
Merge branch 'main' into llm_examples_main
peri044 Jun 13, 2024
4a9f73e
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
0387d0b
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
6193939
chore: updates
peri044 Jun 14, 2024
9d3296e
Merge branch 'main' into llm_examples_main
peri044 Jun 14, 2024
84fc49c
Merge branch 'main' into llm_examples_main
peri044 Jun 18, 2024
ff17d91
chore: rebase
peri044 Jun 18, 2024
8e6ba26
Merge branch 'llm_examples_main' of github.com:pytorch/TensorRT into …
peri044 Jun 24, 2024
67ec408
Merge branch 'main' into llm_examples_main
peri044 Jun 25, 2024
9af8e39
chore: remove aten.full decomposition
peri044 Jun 25, 2024
50d4096
chore: fix expand DS support
peri044 Jun 25, 2024
59febf5
chore: minor fix
peri044 Jun 26, 2024
c3e4382
chore: updates
peri044 Jun 26, 2024
0673db4
chore: add testcase
peri044 Jun 26, 2024
0b62f8f
Merge branch 'main' into full
peri044 Jun 26, 2024
54f6410
Merge branch 'full' into fix_expand_ds
peri044 Jun 26, 2024
ae3d6b2
Merge branch 'fix_expand_ds' into llm_examples_main
peri044 Jun 26, 2024
4464fd5
chore: updates
peri044 Jun 26, 2024
63b13cf
chore: updates
peri044 Jun 28, 2024
3d10b92
Merge branch 'main' into llm_examples_main
peri044 Jun 28, 2024
e97a94f
chore: updates
peri044 Jul 10, 2024
4f503a8
chore: updates
peri044 Jul 11, 2024
5ecf63e
chore: rebase
peri044 Jul 11, 2024
0d00d8c
chore: updates
peri044 Jul 11, 2024
8099003
chore: updates
peri044 Jul 11, 2024
457f706
chore: updates
peri044 Jul 11, 2024
ce3b2f8
chore: updates
peri044 Jul 11, 2024
d8acadc
chore: updates
peri044 Jul 12, 2024
262c87d
chore: updates
peri044 Jul 12, 2024
bb94dfd
chore: rebase
peri044 Jul 17, 2024
736b839
chore: updates
peri044 Jul 17, 2024
313380e
chore: bug fixes
peri044 Jul 18, 2024
1057d83
chore: updates
peri044 Jul 19, 2024
bfd0cf2
chore: fixes
peri044 Jul 20, 2024
17ddb31
chore: updates
peri044 Jul 20, 2024
88be4fa
chore: add torch compile gpt2 example
peri044 Jul 22, 2024
df825ab
chore: updates
peri044 Jul 22, 2024
ff07295
chore: add timing calculation
peri044 Jul 24, 2024
857b0aa
Merge branch 'main' into llm_examples_main
peri044 Jul 24, 2024
8fae56b
Merge branch 'main' into llm_examples_main
peri044 Jul 29, 2024
d483718
chore: rebase
peri044 Jul 31, 2024
397e4bc
Merge branch 'main' into llm_examples_main
peri044 Aug 5, 2024
6c9b9fe
chore: updates
peri044 Aug 5, 2024
6313b1c
chore: updates
peri044 Aug 9, 2024
d608cc5
chore: rebase
peri044 Aug 9, 2024
1327782
chore: rebase fixes
peri044 Aug 9, 2024
0980778
chore: updates
peri044 Aug 9, 2024
94b2ba1
chore: updates
peri044 Aug 9, 2024
2b1db29
chore: updates
peri044 Aug 9, 2024
9f606fc
chore: updates
peri044 Aug 9, 2024
0cf23be
Merge branch 'main' into llm_examples_main
peri044 Aug 14, 2024
3228c57
chore: Update perf tooling with support for HF models (#3034)
peri044 Aug 15, 2024
6786f0e
chore: updates
Aug 15, 2024
e4873d0
chore: updates
peri044 Aug 19, 2024
a725ce0
Merge branch 'main' into llm_examples_main
peri044 Aug 19, 2024
bb10de4
feat: lowering replace aten.full_like with aten.full
chohk88 Aug 12, 2024
1527aa0
chore: minor linting
chohk88 Aug 12, 2024
67e33c3
chore: updates
peri044 Aug 19, 2024
5627c1a
Merge branch 'llm_examples_main' of github.com:pytorch/TensorRT into …
peri044 Aug 19, 2024
7be8604
chore: updates
peri044 Aug 21, 2024
4d75a2e
Merge branch 'main' into llm_examples_main
peri044 Aug 21, 2024
0ab0dbf
feat: add fp32 accumulation option for matmul layer
peri044 Aug 21, 2024
3c815f8
chore: updates
Aug 28, 2024
5617c0a
chore: Bump TRT version to 10.3.0.26 (#3071)
zewenli98 Aug 24, 2024
213526e
chore: updates
peri044 Aug 30, 2024
c193593
chore : updates
peri044 Aug 30, 2024
0de0b16
chore: updates
peri044 Sep 24, 2024
a90191d
chore: rebase with main
peri044 Sep 24, 2024
71e33cb
chore: updates
peri044 Sep 26, 2024
4257b1e
chore: updates
peri044 Sep 30, 2024
619a39a
chore: updates
peri044 Oct 1, 2024
8c0b9c6
chore: trunc_fiv fix
peri044 Oct 7, 2024
b6261f9
chore: update result
peri044 Oct 7, 2024
ebdfe8f
fix: add model.half() for llama2
peri044 Oct 7, 2024
61ec948
chore: address review comments
peri044 Oct 8, 2024
dd27a54
chore: address review comments
peri044 Oct 8, 2024
b2e5244
chore: add docs
peri044 Oct 8, 2024
7ddd637
chore: updates
peri044 Oct 8, 2024
4529717
chore: sign bug fix
peri044 Oct 10, 2024
4 changes: 4 additions & 0 deletions docsrc/index.rst
@@ -37,6 +37,7 @@ User Guide
* :ref:`saving_models`
* :ref:`runtime`
* :ref:`using_dla`
* :ref:`mixed_precision`

.. toctree::
:caption: User Guide
@@ -48,6 +49,7 @@ User Guide
user_guide/saving_models
user_guide/runtime
user_guide/using_dla
user_guide/mixed_precision
tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
tutorials/_rendered_examples/dynamo/vgg16_ptq
tutorials/_rendered_examples/dynamo/engine_caching_example
@@ -118,6 +120,8 @@ Tutorials
tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion
tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
tutorials/_rendered_examples/dynamo/torch_export_gpt2
tutorials/_rendered_examples/dynamo/torch_export_llama2

Python API Documentation
------------------------
74 changes: 74 additions & 0 deletions docsrc/user_guide/mixed_precision.rst
@@ -0,0 +1,74 @@
.. _mixed_precision:

Compile Mixed Precision models with Torch-TensorRT
===================================================
.. currentmodule:: torch_tensorrt.dynamo

.. automodule:: torch_tensorrt.dynamo
:members:
:undoc-members:
:show-inheritance:

Consider the following PyTorch model, which explicitly casts an intermediate layer to run in FP16.

.. code-block:: python

class MyModule(torch.nn.Module):
def __init__(self):
super().__init__()
self.linear1 = torch.nn.Linear(10,10)
self.linear2 = torch.nn.Linear(10,30).half()
self.linear3 = torch.nn.Linear(30,40)

def forward(self, x):
x = self.linear1(x)
x = x.to(torch.float16)
x = self.linear2(x)
x = x.to(torch.float32)
x = self.linear3(x)
return x


If we compile the above model using Torch-TensorRT, the layer profiling logs indicate that all layers
run in FP32. This is because TensorRT picks the kernels for each layer that give the best performance.

.. code-block:: python

inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()]
mod = MyModule().eval().cuda()
ep = torch.export.export(mod, tuple(inputs))
with torch_tensorrt.logging.debug():
trt_gm = torch_tensorrt.dynamo.compile(ep,
inputs=inputs,
debug=True)

# Debug log info
# Layers:
# Name: __myl_MulSum_myl0_0, LayerType: kgen, Inputs: [ { Name: __mye116_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }], TacticName: __myl_MulSum_0xfa6c1858aea1b13b03f90165d7149ec6, StreamId: 0, Metadata:
# Name: __myl_AddResMulSum_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye131_dconst, Dimensions: [10,30], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Float }, { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_AddResMulSum_0xb3915d7ebfe48be45b6d49083479e12f, StreamId: 0, Metadata:
# Name: __myl_AddResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye146_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_AddResMulSumAdd_0xcdd0085ad25f5f45ac5fafb72acbffd6, StreamId: 0, Metadata:


To respect the types specified by the user in the model (e.g., in this case, running the ``linear2`` layer in FP16), users can enable
the compilation setting ``use_explicit_typing=True``. Compiling with this option results in the following TensorRT logs:

.. note:: If you enable ``use_explicit_typing=True``, only ``torch.float32`` is supported in ``enabled_precisions``.

.. code-block:: python

inputs = [torch.randn((1, 10), dtype=torch.float32).cuda()]
mod = MyModule().eval().cuda()
ep = torch.export.export(mod, tuple(inputs))
with torch_tensorrt.logging.debug():
trt_gm = torch_tensorrt.dynamo.compile(ep,
inputs=inputs,
use_explicit_typing=True,
debug=True)

# Debug log info
# Layers:
# Name: __myl_MulSumAddCas_myl0_0, LayerType: kgen, Inputs: [ { Name: linear1/addmm_constant_0 _ linear1/addmm_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,10], Format/Datatype: Float }, { Name: __mye112_dconst, Dimensions: [10,10], Format/Datatype: Float }, { Name: x, Dimensions: [10,1], Format/Datatype: Float }], Outputs: [ { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], TacticName: __myl_MulSumAddCas_0xacf8f5dd9be2f3e7bb09cdddeac6c936, StreamId: 0, Metadata:
# Name: __myl_ResMulSumAddCas_myl0_1, LayerType: kgen, Inputs: [ { Name: __mye127_dconst, Dimensions: [10,30], Format/Datatype: Half }, { Name: linear2/addmm_1_constant_0 _ linear2/addmm_1_add_broadcast_to_same_shape_lhs_broadcast_constantHalf, Dimensions: [1,30], Format/Datatype: Half }, { Name: __myln_k_arg__bb1_2, Dimensions: [1,10], Format/Datatype: Half }], Outputs: [ { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], TacticName: __myl_ResMulSumAddCas_0x5a3b318b5a1c97b7d5110c0291481337, StreamId: 0, Metadata:
# Name: __myl_ResMulSumAdd_myl0_2, LayerType: kgen, Inputs: [ { Name: __mye142_dconst, Dimensions: [30,40], Format/Datatype: Float }, { Name: linear3/addmm_2_constant_0 _ linear3/addmm_2_add_broadcast_to_same_shape_lhs_broadcast_constantFloat, Dimensions: [1,40], Format/Datatype: Float }, { Name: __myln_k_arg__bb1_3, Dimensions: [1,30], Format/Datatype: Float }], Outputs: [ { Name: output0, Dimensions: [1,40], Format/Datatype: Float }], TacticName: __myl_ResMulSumAdd_0x3fad91127c640fd6db771aa9cde67db0, StreamId: 0, Metadata:

Now the ``linear2`` layer runs in FP16 as shown in the above logs.
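
The FP32 matmul accumulation option added in this PR composes with explicit typing. The following is a minimal sketch (not taken from the docs above) showing both flags together on a hypothetical FP16 model; the ``torch.nn.Sequential`` stand-in and tensor shapes are illustrative assumptions.

.. code-block:: python

    import torch
    import torch_tensorrt

    # Hypothetical FP16 model used only to illustrate the flags
    model = (
        torch.nn.Sequential(
            torch.nn.Linear(10, 30),
            torch.nn.ReLU(),
            torch.nn.Linear(30, 40),
        )
        .half()
        .eval()
        .cuda()
    )

    inputs = [torch.randn((1, 10), dtype=torch.float16).cuda()]
    ep = torch.export.export(model, tuple(inputs))
    trt_gm = torch_tensorrt.dynamo.compile(
        ep,
        inputs=inputs,
        enabled_precisions={torch.float32},  # only FP32 is allowed with explicit typing
        use_explicit_typing=True,            # respect the precisions set in the model
        use_fp32_acc=True,                   # accumulate matmuls in FP32
    )

With both flags enabled, TensorRT keeps the matmul inputs in FP16 while accumulating in FP32, which is closer to PyTorch's eager-mode numerics.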
31 changes: 24 additions & 7 deletions examples/dynamo/README.rst
@@ -1,19 +1,36 @@
.. _torch_compile:

Dynamo / ``torch.compile``
----------------------------
Torch-TensorRT Examples
====================================

Torch-TensorRT provides a backend for the new ``torch.compile`` API released in PyTorch 2.0. In the following examples we describe
a number of ways you can leverage this backend to accelerate inference.
Please refer to the following examples which demonstrate the usage of different features of Torch-TensorRT. We also provide
examples of Torch-TensorRT compilation of select computer vision and language models.

* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile``
* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile``
Dependencies
------------------------------------

Please install the following external dependencies (assuming you already have the correct `torch`, `torch_tensorrt` and `tensorrt` libraries installed; see `dependencies <https://github.com/pytorch/TensorRT?tab=readme-ov-file#dependencies>`_):

.. code-block:: sh

pip install -r requirements.txt


Compiler Features
------------------------------------
* :ref:`torch_compile_advanced_usage`: Advanced usage including making a custom backend to use directly with the ``torch.compile`` API
* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile``
* :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"`
* :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines
* :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights
* :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile``
* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times
* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT

Model Zoo
------------------------------------
* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile``
* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile``
* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile``
* :ref:`torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`)
* :ref:`torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`)
4 changes: 2 additions & 2 deletions examples/dynamo/requirements.txt
@@ -1,4 +1,4 @@
cupy==13.1.0
torch>=2.4.0.dev20240503+cu121
torch-tensorrt>=2.4.0.dev20240503+cu121
triton==2.3.0
diffusers==0.30.3
transformers==4.44.2
30 changes: 22 additions & 8 deletions examples/dynamo/torch_export_gpt2.py
@@ -25,12 +25,16 @@
# CPU is used here so that GPU memory is reserved for TRT compilation.
with torch.no_grad():
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained(
"gpt2",
pad_token_id=tokenizer.eos_token_id,
use_cache=False,
attn_implementation="eager",
).eval()
model = (
AutoModelForCausalLM.from_pretrained(
"gpt2",
pad_token_id=tokenizer.eos_token_id,
use_cache=False,
attn_implementation="eager",
)
.eval()
.half()
)

# %%
# Tokenize a sample input prompt and get pytorch model outputs
@@ -48,6 +52,10 @@
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Export the GPT2 model into an ExportedProgram which is input of TRT compilation
# To compile the model in FP16, we do the following:
# 1) Cast the model to FP16 via model.half()
# 2) Enable use_explicit_typing=True. Certain layers are explicitly cast to FP32 within the PyTorch model, and this flag respects that behavior during TRT compilation
# 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch)
gpt2_ep = export_llm(model, input_ids, max_seq_len=1024)
trt_model = torch_tensorrt.dynamo.compile(
gpt2_ep,
@@ -56,6 +64,8 @@
truncate_double=True,
device=DEVICE,
disable_tf32=True,
use_explicit_typing=True,
use_fp32_acc=True,
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
@@ -81,6 +91,10 @@
# %%
# The output sentences should look like
# =============================
# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# Pytorch model generated text: What is parallel programming ?

# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. The main difference between parallel programming and parallel programming is that
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# TensorRT model generated text: What is parallel programming ?

# The parallel programming paradigm is a set of programming languages that are designed to be used in parallel. The main difference between parallel programming and parallel programming is that
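
The auto-regressive generation loop referenced above is provided by a helper that is not part of this diff. A minimal greedy-decoding sketch of what such a loop might look like is shown below; ``MAX_TOKENS`` and the assumption that the compiled model returns logits of shape ``[batch, seq_len, vocab]`` are illustrative, not taken from the example's helper.

.. code-block:: python

    import torch

    MAX_TOKENS = 64  # assumed generation budget
    seq = input_ids.clone()
    with torch.no_grad():
        for _ in range(MAX_TOKENS):
            logits = trt_model(seq)  # assumed to return [batch, seq_len, vocab] logits
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            seq = torch.cat([seq, next_token], dim=-1)
            if (next_token == tokenizer.eos_token_id).all():
                break

    print(tokenizer.decode(seq[0], skip_special_tokens=True))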
21 changes: 15 additions & 6 deletions examples/dynamo/torch_export_llama2.py
@@ -24,9 +24,13 @@
# CPU is used here so that GPU memory is reserved for TRT compilation.
llama_path = "meta-llama/Llama-2-7b-chat-hf"
with torch.no_grad():
model = AutoModelForCausalLM.from_pretrained(
llama_path, use_cache=False, attn_implementation="eager"
).eval()
model = (
AutoModelForCausalLM.from_pretrained(
llama_path, use_cache=False, attn_implementation="eager"
)
.eval()
.half()
)

tokenizer = AutoTokenizer.from_pretrained(llama_path)

@@ -45,15 +49,20 @@
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Export the llama2 model into an ExportedProgram which is input of TRT compilation
# To compile the model in FP16, we do the following:
# 1) Cast the model to FP16 via model.half()
# 2) Enable use_explicit_typing=True. Certain layers are explicitly cast to FP32 within the PyTorch model, and this flag respects that behavior during TRT compilation
# 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch)
llama2_ep = export_llm(model, input_ids, max_seq_len=64)
trt_model = torch_tensorrt.dynamo.compile(
llama2_ep,
inputs=[input_ids],
enabled_precisions={torch.float32},
min_block_size=1,
truncate_double=True,
device=DEVICE,
disable_tf32=True,
use_explicit_typing=True,
use_fp32_acc=True,
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
@@ -85,6 +94,6 @@
# %%
# The output sentences should look like
# =============================
# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# Pytorch model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
# TensorRT model generated text: Dynamic programming is an algorithmic technique used to solve complex problems by breaking them down into smaller subproblems, solving each subproblem only once, and
28 changes: 27 additions & 1 deletion py/torch_tensorrt/dynamo/_compiler.py
@@ -88,6 +88,8 @@ def compile(
engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR,
engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE,
custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE,
use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING,
use_fp32_acc: bool = _defaults.USE_FP32_ACC,
**kwargs: Any,
) -> torch.fx.GraphModule:
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -158,6 +160,8 @@ def compile(
engine_cache_dir (Optional[str]): Directory to store the cached TRT engines
engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default
custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored.
use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation, which respects the precisions set in the PyTorch model. This is useful when users have mixed-precision graphs.
use_fp32_acc (bool): This option inserts FP32 cast nodes around matmul layers, and TensorRT ensures the matmul accumulation happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
**kwargs: Any,
Returns:
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -197,6 +201,20 @@ def compile(
"\nThis feature is unimplemented in Torch-TRT Dynamo currently."
)

if use_explicit_typing:
if len(enabled_precisions) != 1 or not any(
x in enabled_precisions for x in {torch.float32, dtype.f32}
):
raise AssertionError(
f"When use_explicit_typing is enabled, only torch.float32 is allowed in the enabled_precisions but found {enabled_precisions}"
)

if use_fp32_acc:
logger.debug(
"FP32 accumulation for matmul layers is enabled. This option should only be enabled if the model already has FP16 weights and has no effect if it has FP32 weights. \
This flag inserts casts around matmul layers and ensures TensorRT executes the matmul layers in FP16 with FP32 accumulation."
)

# Aliasing inputs to arg_inputs for better understanding
if not arg_inputs and not inputs:
raise AssertionError("'arg_inputs' and 'inputs' should not both be None.")
@@ -232,7 +250,7 @@ def compile(
logger.debug("Input graph: " + str(gm.graph))

# Apply lowering on the graph module
gm = post_lowering(gm)
gm = post_lowering(gm, use_fp32_acc=use_fp32_acc)
logger.debug("Lowered Input graph: " + str(gm.graph))

engine_cache = None
@@ -281,6 +299,8 @@ def compile(
"lazy_engine_init": lazy_engine_init,
"cache_built_engines": cache_built_engines,
"reuse_cached_engines": reuse_cached_engines,
"use_explicit_typing": use_explicit_typing,
"use_fp32_acc": use_fp32_acc,
}

settings = CompilationSettings(**compilation_options)
@@ -520,6 +540,8 @@ def convert_exported_program_to_serialized_trt_engine(
calibrator: object = None,
allow_shape_tensors: bool = False,
timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
use_explicit_typing: bool = _defaults.USE_EXPLICIT_TYPING,
use_fp32_acc: bool = _defaults.USE_FP32_ACC,
**kwargs: Any,
) -> bytes:
"""Convert an ExportedProgram to a serialized TensorRT engine
@@ -578,6 +600,8 @@ def convert_exported_program_to_serialized_trt_engine(
calibrator (Union(torch_tensorrt._C.IInt8Calibrator, tensorrt.IInt8Calibrator)): Calibrator object which will provide data to the PTQ system for INT8 Calibration
allow_shape_tensors: (Experimental) Allow aten::size to output shape tensors using IShapeLayer in TensorRT
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
use_explicit_typing (bool): This flag enables strong typing in TensorRT compilation, which respects the precisions set in the PyTorch model. This is useful when users have mixed-precision graphs.
use_fp32_acc (bool): This option inserts FP32 cast nodes around matmul layers, and TensorRT ensures the matmul accumulation happens in FP32. Use this only when FP16 precision is configured in enabled_precisions.
Returns:
bytes: Serialized TensorRT engine, can either be saved to a file or deserialized via TensorRT APIs
"""
@@ -651,6 +675,8 @@ def convert_exported_program_to_serialized_trt_engine(
"dla_local_dram_size": dla_local_dram_size,
"dla_global_dram_size": dla_global_dram_size,
"timing_cache_path": timing_cache_path,
"use_explicit_typing": use_explicit_typing,
"use_fp32_acc": use_fp32_acc,
}

exported_program = pre_export_lowering(exported_program)
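
For context, ``use_fp32_acc`` is applied as a graph lowering step (note the ``post_lowering(gm, use_fp32_acc=use_fp32_acc)`` call above) that wraps matmuls in FP32 casts. Below is a hedged sketch of what such a pass could look like on an aten-level FX graph; the function name, the set of matmul targets, and the hard-coded FP16 output cast are assumptions, not the actual pass in this PR.

.. code-block:: python

    import torch
    from torch.fx import GraphModule

    # Illustrative set of matmul ops to rewrite (assumption)
    MATMUL_TARGETS = {torch.ops.aten.mm.default, torch.ops.aten.bmm.default}

    def insert_fp32_acc_casts(gm: GraphModule) -> GraphModule:
        """Wrap each matmul in FP32 casts so its accumulation happens in FP32."""
        for node in list(gm.graph.nodes):
            if node.op == "call_function" and node.target in MATMUL_TARGETS:
                # Cast both matmul operands to FP32 just before the matmul
                with gm.graph.inserting_before(node):
                    casts = [
                        gm.graph.call_function(
                            torch.ops.aten._to_copy.default,
                            (arg,),
                            {"dtype": torch.float32},
                        )
                        for arg in node.args
                    ]
                node.args = tuple(casts)
                # Cast the FP32 result back to FP16 for downstream consumers
                with gm.graph.inserting_after(node):
                    back = gm.graph.call_function(
                        torch.ops.aten._to_copy.default,
                        (node,),
                        {"dtype": torch.float16},
                    )
                node.replace_all_uses_with(back)
                back.args = (node,)  # restore the matmul as the cast's input
        gm.graph.lint()
        gm.recompile()
        return gm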
2 changes: 2 additions & 0 deletions py/torch_tensorrt/dynamo/_defaults.py
@@ -40,6 +40,8 @@
ENGINE_CACHE_DIR = os.path.join(tempfile.gettempdir(), "torch_tensorrt_engine_cache")
ENGINE_CACHE_SIZE = 1073741824
CUSTOM_ENGINE_CACHE = None
USE_EXPLICIT_TYPING = False
USE_FP32_ACC = False


def default_device() -> Device: