from typing import Any, List, Optional, Sequence, Tuple
import numpy as np
-import tensorrt as trt
import torch
from torch.export import ExportedProgram
from torch_tensorrt._enums import dtype
@@ -488,6 +489,7 @@ Source code for torch_tensorrt.dynamo._refit
)
from torch_tensorrt.dynamo.utils import (
check_module_output,
+ get_model_device,
get_torch_inputs,
set_log_level,
to_torch_device,
@@ -495,6 +497,8 @@ Source code for torch_tensorrt.dynamo._refit
)
from torch_tensorrt.logging import TRT_LOGGER
+import tensorrt as trt
+
logger = logging.getLogger(__name__)
@@ -599,7 +603,7 @@ Source code for torch_tensorrt.dynamo._refit
"""
refitted = set()
- torch_device = list(new_gm.state_dict().values())[0].device.type
+ torch_device = get_model_device(new_gm)
refitter = trt.Refitter(old_engine, TRT_LOGGER)
weight_list = refitter.get_all_weights()
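
A rough sketch of the behavior ``get_model_device`` is expected to provide, based on its use above (an assumption, not the actual ``torch_tensorrt.dynamo.utils`` implementation):

    import torch

    def get_model_device(module: torch.nn.Module) -> torch.device:
        # Return the device of the first parameter or buffer found;
        # fall back to CPU for graphs that carry no tensors.
        for tensor in module.parameters():
            return tensor.device
        for tensor in module.buffers():
            return tensor.device
        return torch.device("cpu")
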
diff --git a/docs/_modules/torch_tensorrt/dynamo/_settings.html b/docs/_modules/torch_tensorrt/dynamo/_settings.html
index 62a8669834..37fab4bc97 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_settings.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_settings.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._settings — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation
+ torch_tensorrt.dynamo._settings — Torch-TensorRT v2.5.0.dev0+a4a9419 documentation
@@ -272,7 +272,7 @@
- v2.5.0.dev0+b3a8cdd
+ v2.5.0.dev0+a4a9419
@@ -313,6 +313,9 @@
Deploying Torch-TensorRT Programs
DLA
Torch Compile Advanced Usage
+ Deploy Quantized Models using Torch-TensorRT
+ Engine Caching
+ Refitting Torch-TensorRT Programs with New Weights
Dynamo Frontend
@@ -338,7 +341,6 @@
- Example notebooks
- Compiling ResNet using the Torch-TensorRT torch.compile Backend
- Compiling a Transformer using torch.compile and TensorRT
-- Torch Compile Advanced Usage
- Torch Compile Stable Diffusion
- Torch Export with Cudagraphs
- Using Custom Kernels within TensorRT Engines with Torch-TensorRT
@@ -459,6 +461,7 @@ Source code for torch_tensorrt.dynamo._settings
<
from torch_tensorrt._enums import EngineCapability, dtype
from torch_tensorrt.dynamo._defaults import (
ASSUME_DYNAMIC_SHAPE_SUPPORT,
+ CACHE_BUILT_ENGINES,
DEBUG,
DISABLE_TF32,
DLA_GLOBAL_DRAM_SIZE,
@@ -477,6 +480,7 @@
Source code for torch_tensorrt.dynamo._settings
<
OPTIMIZATION_LEVEL,
PASS_THROUGH_BUILD_FAILURES,
REQUIRE_FULL_COMPILATION,
+ REUSE_CACHED_ENGINES,
SPARSE_WEIGHTS,
TIMING_CACHE_PATH,
TRUNCATE_DOUBLE,
@@ -527,6 +531,8 @@
Source code for torch_tensorrt.dynamo._settings
<
output to a file if a string path is specified
hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer)
timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation
+ cache_built_engines (bool): Whether to save the compiled TRT engines to storage
+ reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage
"""
enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -558,7 +564,9 @@
Source code for torch_tensorrt.dynamo._settings
<
dryrun: Union[bool, str] = DRYRUN
hardware_compatible: bool = HARDWARE_COMPATIBLE
timing_cache_path: str = TIMING_CACHE_PATH
- lazy_engine_init: bool = LAZY_ENGINE_INIT
+ lazy_engine_init: bool = LAZY_ENGINE_INIT
+ cache_built_engines: bool = CACHE_BUILT_ENGINES
+ reuse_cached_engines: bool = REUSE_CACHED_ENGINES
diff --git a/docs/_modules/torch_tensorrt/dynamo/_tracer.html b/docs/_modules/torch_tensorrt/dynamo/_tracer.html
index aeda025161..0f0a03d1dc 100644
--- a/docs/_modules/torch_tensorrt/dynamo/_tracer.html
+++ b/docs/_modules/torch_tensorrt/dynamo/_tracer.html
@@ -9,7 +9,7 @@
- torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation
+ torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.5.0.dev0+a4a9419 documentation
@@ -272,7 +272,7 @@
- v2.5.0.dev0+b3a8cdd
+ v2.5.0.dev0+a4a9419
@@ -313,6 +313,9 @@
- Deploying Torch-TensorRT Programs
- DLA
- Torch Compile Advanced Usage
+ - Deploy Quantized Models using Torch-TensorRT
+ - Engine Caching
+ - Refitting Torch-TensorRT Programs with New Weights
Dynamo Frontend
@@ -338,7 +341,6 @@
- Example notebooks
- Compiling ResNet using the Torch-TensorRT torch.compile Backend
- Compiling a Transformer using torch.compile and TensorRT
-- Torch Compile Advanced Usage
- Torch Compile Stable Diffusion
- Torch Export with Cudagraphs
- Using Custom Kernels within TensorRT Engines with Torch-TensorRT
diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt
index da5ee3d690..757acc2011 100644
--- a/docs/_sources/index.rst.txt
+++ b/docs/_sources/index.rst.txt
@@ -44,13 +44,14 @@ User Guide
:hidden:
user_guide/torch_tensorrt_explained
- user_guide/getting_started
user_guide/dynamic_shapes
user_guide/saving_models
user_guide/runtime
user_guide/using_dla
tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
- tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq
+ tutorials/_rendered_examples/dynamo/vgg16_ptq
+ tutorials/_rendered_examples/dynamo/engine_caching_example
+ tutorials/_rendered_examples/dynamo/refit_engine_example
Dynamo Frontend
----------------
@@ -111,13 +112,11 @@ Tutorials
tutorials/notebooks
tutorials/_rendered_examples/dynamo/torch_compile_resnet_example
tutorials/_rendered_examples/dynamo/torch_compile_transformers_example
- tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion
tutorials/_rendered_examples/dynamo/torch_export_cudagraphs
tutorials/_rendered_examples/dynamo/custom_kernel_plugins
tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion
- tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq
tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
Python API Documentation
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt
new file mode 100644
index 0000000000..e72f42cfb2
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt
@@ -0,0 +1,127 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/engine_caching_bert_example.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py:
+
+
+.. _engine_caching_bert_example:
+
+Engine Caching (BERT)
+=======================
+
+Small caching example on BERT.
+
+.. GENERATED FROM PYTHON SOURCE LINES 10-76
+
+.. code-block:: python
+
+
+ import numpy as np
+ import torch
+ import torch_tensorrt
+ from engine_caching_example import remove_timing_cache
+ from transformers import BertModel
+
+ np.random.seed(0)
+ torch.manual_seed(0)
+
+ model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval()
+ inputs = [
+ torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+ torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
+ ]
+
+
+ def compile_bert(iterations=3):
+ times = []
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ # The 1st iteration is to measure the compilation time without engine caching
+ # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+ # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+ # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+ for i in range(iterations):
+ # remove timing cache and reset dynamo for engine caching measurement
+ remove_timing_cache()
+ torch._dynamo.reset()
+
+ if i == 0:
+ cache_built_engines = False
+ reuse_cached_engines = False
+ else:
+ cache_built_engines = True
+ reuse_cached_engines = True
+
+ start.record()
+ compilation_kwargs = {
+ "use_python_runtime": False,
+ "enabled_precisions": {torch.float},
+ "truncate_double": True,
+ "debug": False,
+ "min_block_size": 1,
+ "make_refitable": True,
+ "cache_built_engines": cache_built_engines,
+ "reuse_cached_engines": reuse_cached_engines,
+ "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache",
+ "engine_cache_size": 1 << 30, # 1GB
+ }
+ optimized_model = torch.compile(
+ model,
+ backend="torch_tensorrt",
+ options=compilation_kwargs,
+ )
+ optimized_model(*inputs)
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ print("-----compile bert-----> compilation time:\n", times, "milliseconds")
+
+
+ if __name__ == "__main__":
+ compile_bert()
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: engine_caching_bert_example.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: engine_caching_bert_example.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt
new file mode 100644
index 0000000000..df61bec65e
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt
@@ -0,0 +1,361 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/engine_caching_example.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py:
+
+
+.. _engine_caching_example:
+
+Engine Caching
+=======================
+
+As model sizes increase, the cost of compilation will as well. With AOT methods
+like ``torch_tensorrt.dynamo.compile``, this cost is paid upfront. However, if the weights
+change, the session ends, or you are using JIT methods like ``torch.compile``, graphs get
+invalidated and re-compiled, so this cost is paid repeatedly.
+Engine caching is a way to mitigate this cost by saving constructed engines to disk
+and re-using them when possible. This tutorial demonstrates how to use engine caching
+with TensorRT in PyTorch. Engine caching can significantly speed up subsequent model
+compilations by reusing previously built TensorRT engines.
+
+We'll explore two approaches:
+
+ 1. Using torch_tensorrt.dynamo.compile
+ 2. Using torch.compile with the TensorRT backend
+
+The example uses a pre-trained ResNet18 model and shows the
+differences between compilation without caching, with caching enabled,
+and when reusing cached engines.
+
+.. GENERATED FROM PYTHON SOURCE LINES 26-52
+
+.. code-block:: python
+
+
+ import os
+ from typing import Dict, Optional
+
+ import numpy as np
+ import torch
+ import torch_tensorrt as torch_trt
+ import torchvision.models as models
+ from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
+ from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
+
+ np.random.seed(0)
+ torch.manual_seed(0)
+
+ model = models.resnet18(pretrained=True).eval().to("cuda")
+ enabled_precisions = {torch.float}
+ debug = False
+ min_block_size = 1
+ use_python_runtime = False
+
+
+ def remove_timing_cache(path=TIMING_CACHE_PATH):
+ if os.path.exists(path):
+ os.remove(path)
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 53-67
+
+Engine Caching for JIT Compilation
+----------------------------------
+
+The primary goal of engine caching is to help speed up JIT workflows. ``torch.compile``
+provides a great deal of flexibility in model construction, which makes it a good
+first tool to try when looking to speed up your workflow. However, historically
+the cost of compilation, and in particular recompilation, has been a barrier to entry
+for many users. Prior to the addition of engine caching, if a subgraph got invalidated for any
+reason, that graph was rebuilt from scratch. Now, with ``cache_built_engines=True``, engines
+are saved to disk as they are constructed, tied to a hash of their corresponding PyTorch subgraph. If
+the same subgraph is encountered in a subsequent compilation, either as part of this session or a new session, the cache will
+pull the built engine and **refit** the weights, which can reduce compilation times by orders of magnitude.
+Note that in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``),
+the engine must be refittable (``make_refitable=True``). See :ref:`refit_engine_example` for more details.
+
+.. GENERATED FROM PYTHON SOURCE LINES 67-118
+
+.. code-block:: python
+
+
+
+ def torch_compile(iterations=3):
+ times = []
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ # The 1st iteration is to measure the compilation time without engine caching
+ # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+ # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+ # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+ for i in range(iterations):
+ inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+ # remove timing cache and reset dynamo just for engine caching measurement
+ remove_timing_cache()
+ torch._dynamo.reset()
+
+ if i == 0:
+ cache_built_engines = False
+ reuse_cached_engines = False
+ else:
+ cache_built_engines = True
+ reuse_cached_engines = True
+
+ start.record()
+ compiled_model = torch.compile(
+ model,
+ backend="tensorrt",
+ options={
+ "use_python_runtime": True,
+ "enabled_precisions": enabled_precisions,
+ "debug": debug,
+ "min_block_size": min_block_size,
+ "make_refitable": True,
+ "cache_built_engines": cache_built_engines,
+ "reuse_cached_engines": reuse_cached_engines,
+ },
+ )
+ compiled_model(*inputs) # trigger the compilation
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ print("----------------torch_compile----------------")
+ print("disable engine caching, used:", times[0], "ms")
+ print("enable engine caching to cache engines, used:", times[1], "ms")
+ print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+ torch_compile()
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 119-124
+
+Engine Caching for AOT Compilation
+----------------------------------
+Similarly to the JIT workflow, AOT workflows can benefit from engine caching.
+As the same architecture or common subgraphs get recompiled, the cache will pull
+previously built engines and refit the weights.
+
+.. GENERATED FROM PYTHON SOURCE LINES 124-178
+
+.. code-block:: python
+
+
+
+ def dynamo_compile(iterations=3):
+ times = []
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),)
+ # Mark the dim0 of inputs as dynamic
+ batch = torch.export.Dim("batch", min=1, max=200)
+ exp_program = torch.export.export(
+ model, args=example_inputs, dynamic_shapes={"x": {0: batch}}
+ )
+
+ # The 1st iteration is to measure the compilation time without engine caching
+ # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+ # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+ # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+ for i in range(iterations):
+ inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
+ remove_timing_cache() # remove timing cache just for engine caching measurement
+ if i == 0:
+ cache_built_engines = False
+ reuse_cached_engines = False
+ else:
+ cache_built_engines = True
+ reuse_cached_engines = True
+
+ start.record()
+ trt_gm = torch_trt.dynamo.compile(
+ exp_program,
+ tuple(inputs),
+ use_python_runtime=use_python_runtime,
+ enabled_precisions=enabled_precisions,
+ debug=debug,
+ min_block_size=min_block_size,
+ make_refitable=True,
+ cache_built_engines=cache_built_engines,
+ reuse_cached_engines=reuse_cached_engines,
+ engine_cache_size=1 << 30, # 1GB
+ )
+ # output = trt_gm(*inputs)
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ print("----------------dynamo_compile----------------")
+ print("disable engine caching, used:", times[0], "ms")
+ print("enable engine caching to cache engines, used:", times[1], "ms")
+ print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+ dynamo_compile()
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 179-195
+
+Custom Engine Cache
+----------------------
+
+By default, the engine cache is stored in the system's temporary directory. Both the cache directory and
+size limit can be customized by passing ``engine_cache_dir`` and ``engine_cache_size``.
+Users can also define their own engine cache implementation by extending the ``BaseEngineCache`` class.
+This allows for remote or shared caching if so desired.
+
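+Both options apply to the default disk cache. A minimal sketch, reusing the globals defined
+earlier in this example (the cache directory path below is purely illustrative):
+
+.. code-block:: python
+
+    compiled_model = torch.compile(
+        model,
+        backend="tensorrt",
+        options={
+            "make_refitable": True,
+            "cache_built_engines": True,
+            "reuse_cached_engines": True,
+            "engine_cache_dir": "/tmp/my_engine_cache",  # illustrative location
+            "engine_cache_size": 2 << 30,  # cap the cache at 2 GB
+        },
+    )
+    compiled_model(torch.rand((100, 3, 224, 224)).to("cuda"))  # trigger compilation
+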
+The custom engine cache should implement the following methods:
+ - ``save``: Save the engine blob to the cache.
+ - ``load``: Load the engine blob from the cache.
+
+The hash provided by the cache system is a weight-agnostic hash of the originating PyTorch subgraph (post lowering).
+The blob contains a serialized engine, calling spec data, and weight map information in the pickle format.
+
+Below is an example of a custom engine cache implementation, a ``RAMEngineCache`` that holds engine blobs in host memory.
+
+.. GENERATED FROM PYTHON SOURCE LINES 195-289
+
+.. code-block:: python
+
+
+
+ class RAMEngineCache(BaseEngineCache):
+ def __init__(
+ self,
+ ) -> None:
+ """
+ Constructs a user held engine cache in memory.
+ """
+ self.engine_cache: Dict[str, bytes] = {}
+
+ def save(
+ self,
+ hash: str,
+ blob: bytes,
+ ):
+ """
+ Insert the engine blob to the cache.
+
+ Args:
+ hash (str): The hash key to associate with the engine blob.
+ blob (bytes): The engine blob to be saved.
+
+ Returns:
+ None
+ """
+ self.engine_cache[hash] = blob
+
+ def load(self, hash: str) -> Optional[bytes]:
+ """
+ Load the engine blob from the cache.
+
+ Args:
+ hash (str): The hash key of the engine to load.
+
+ Returns:
+ Optional[bytes]: The engine blob if found, None otherwise.
+ """
+ if hash in self.engine_cache:
+ return self.engine_cache[hash]
+ else:
+ return None
+
+
+ def torch_compile_my_cache(iterations=3):
+ times = []
+ engine_cache = RAMEngineCache()
+ start = torch.cuda.Event(enable_timing=True)
+ end = torch.cuda.Event(enable_timing=True)
+
+ # The 1st iteration is to measure the compilation time without engine caching
+ # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+ # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+ # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+ for i in range(iterations):
+ inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+ # remove timing cache and reset dynamo just for engine caching measurement
+ remove_timing_cache()
+ torch._dynamo.reset()
+
+ if i == 0:
+ cache_built_engines = False
+ reuse_cached_engines = False
+ else:
+ cache_built_engines = True
+ reuse_cached_engines = True
+
+ start.record()
+ compiled_model = torch.compile(
+ model,
+ backend="tensorrt",
+ options={
+ "use_python_runtime": True,
+ "enabled_precisions": enabled_precisions,
+ "debug": debug,
+ "min_block_size": min_block_size,
+ "make_refitable": True,
+ "cache_built_engines": cache_built_engines,
+ "reuse_cached_engines": reuse_cached_engines,
+ "custom_engine_cache": engine_cache,
+ },
+ )
+ compiled_model(*inputs) # trigger the compilation
+ end.record()
+ torch.cuda.synchronize()
+ times.append(start.elapsed_time(end))
+
+ print("----------------torch_compile----------------")
+ print("disable engine caching, used:", times[0], "ms")
+ print("enable engine caching to cache engines, used:", times[1], "ms")
+ print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+ torch_compile_my_cache()
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_engine_caching_example.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: engine_caching_example.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: engine_caching_example.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
index 6e5917ae7b..64ecdc59aa 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt
@@ -19,6 +19,8 @@ a number of ways you can leverage this backend to accelerate inference.
* :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights
* :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile``
+* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times
+* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT
@@ -61,6 +63,23 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py`
+
+.. raw:: html
+
+
Refitting Torch-TensorRT Programs with New Weights
+
+
+
.. raw:: html
@@ -80,18 +99,18 @@ a number of ways you can leverage this backend to accelerate inference.
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py`
.. raw:: html
- Refit TenorRT Graph Module with Torch-TensorRT
+ Compiling GPT2 using the Torch-TensorRT with dynamo backend
@@ -112,6 +131,40 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py`
+
+.. raw:: html
+
+
Compiling Llama2 using the Torch-TensorRT with dynamo backend
+
+
+
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
+
+.. raw:: html
+
+
Engine Caching (BERT)
+
+
+
.. raw:: html
@@ -163,6 +216,23 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py`
+
+.. raw:: html
+
+
Engine Caching
+
+
+
.. raw:: html
@@ -190,11 +260,15 @@ a number of ways you can leverage this backend to accelerate inference.
/tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion
/tutorials/_rendered_examples/dynamo/torch_export_cudagraphs
- /tutorials/_rendered_examples/dynamo/torch_compile_transformers_example
/tutorials/_rendered_examples/dynamo/refit_engine_example
+ /tutorials/_rendered_examples/dynamo/torch_compile_transformers_example
+ /tutorials/_rendered_examples/dynamo/torch_export_gpt2
/tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage
+ /tutorials/_rendered_examples/dynamo/torch_export_llama2
+ /tutorials/_rendered_examples/dynamo/engine_caching_bert_example
/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example
/tutorials/_rendered_examples/dynamo/torch_compile_resnet_example
/tutorials/_rendered_examples/dynamo/vgg16_ptq
+ /tutorials/_rendered_examples/dynamo/engine_caching_example
/tutorials/_rendered_examples/dynamo/custom_kernel_plugins
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
index cc0b9fd21e..fb48bc8536 100644
--- a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt
@@ -20,31 +20,38 @@
.. _refit_engine_example:
-Refit TenorRT Graph Module with Torch-TensorRT
+Refitting Torch-TensorRT Programs with New Weights
===================================================================
-We are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights.
-
-In many cases, we frequently update the weights of models, such as applying various LoRA to Stable Diffusion or constant A/B testing of AI products.
-That poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient.
-Torch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow.
+Compilation is an expensive operation as it involves many graph transformations, translations,
+and optimizations applied to the model. In cases where the weights of a model might be updated
+occasionally (e.g. inserting LoRA adapters), the large cost of recompilation can make it infeasible
+to use TensorRT if the compiled program had to be rebuilt from scratch each time. Torch-TensorRT
+provides a PyTorch-native mechanism to update the weights of a compiled TensorRT program without
+recompiling from scratch, through weight refitting.
In this tutorial, we are going to walk through
-1. Compiling a PyTorch model to a TensorRT Graph Module
-2. Save and load a graph module
-3. Refit the graph module
-.. GENERATED FROM PYTHON SOURCE LINES 20-22
+ 1. Compiling a PyTorch model to a TensorRT Graph Module
+ 2. Save and load a graph module
+ 3. Refit the graph module
+
+This tutorial focuses mostly on the AOT workflow, where it is most likely that a user might need to
+manually refit a module. In the JIT workflow, weight changes trigger recompilation; however, since the engine
+has previously been built, with an engine cache enabled Torch-TensorRT can automatically recognize
+the previously built engine, trigger refit, and short-circuit recompilation on behalf of the user (see :ref:`engine_caching_example`).
+
+.. GENERATED FROM PYTHON SOURCE LINES 27-29
Standard Workflow
-----------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 24-26
+.. GENERATED FROM PYTHON SOURCE LINES 31-33
Imports and model definition
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-.. GENERATED FROM PYTHON SOURCE LINES 26-38
+.. GENERATED FROM PYTHON SOURCE LINES 33-45
.. code-block:: python
@@ -61,17 +68,24 @@ Imports and model definition
-.. GENERATED FROM PYTHON SOURCE LINES 39-41
+.. GENERATED FROM PYTHON SOURCE LINES 46-55
-Compile the module for the first time and save it.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Make a Refittable Compilation Program
+---------------------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 41-66
+The initial step is to compile a module and save it as normal. Note that there is an
+additional parameter ``make_refitable`` that is set to ``True``. This parameter is used to
+indicate that the engine being built should support weight refitting later. Engines built without
+this setting will not be able to be refit.
+
+In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
+
+.. GENERATED FROM PYTHON SOURCE LINES 55-79
.. code-block:: python
- model = models.resnet18(pretrained=True).eval().to("cuda")
+ model = models.resnet18(pretrained=False).eval().to("cuda")
exp_program = torch.export.export(model, tuple(inputs))
enabled_precisions = {torch.float}
debug = False
@@ -91,23 +105,27 @@ Compile the module for the first time and save it.
) # Output is a torch.fx.GraphModule
# Save the graph module as an exported program
- # This is only supported when use_python_runtime = False
torch_trt.save(trt_gm, "./compiled.ep", inputs=inputs)
-.. GENERATED FROM PYTHON SOURCE LINES 67-69
+.. GENERATED FROM PYTHON SOURCE LINES 80-87
-Refit the module with update model weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Refit the Program with Pretrained Weights
+------------------------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 69-93
+Random weights are not useful for inference. But now, instead of recompiling the model, we can
+refit it with the pretrained weights. This is done by setting up another PyTorch module
+with the target weights and exporting it as an ExportedProgram. Then the ``refit_module_weights``
+function is used to update the weights of the compiled module with the new weights.
+
+.. GENERATED FROM PYTHON SOURCE LINES 87-111
.. code-block:: python
# Create and compile the updated model
- model2 = models.resnet18(pretrained=False).eval().to("cuda")
+ model2 = models.resnet18(pretrained=True).eval().to("cuda")
exp_program2 = torch.export.export(model2, tuple(inputs))
@@ -130,18 +148,36 @@ Refit the module with update model weights
print("Refit successfully!")
-.. GENERATED FROM PYTHON SOURCE LINES 94-96
+.. GENERATED FROM PYTHON SOURCE LINES 112-140
-Alternative Workflow using Python Runtime
+Advanced Usage
-----------------------------
-.. GENERATED FROM PYTHON SOURCE LINES 96-99
+There are a number of settings you can use to control the refit process.
-.. code-block:: python
+Weight Map Cache
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Weight refitting works by matching the weights of the compiled module with the new weights from
+the user-supplied ExportedProgram. Since 1:1 name matching from PyTorch to TensorRT is hard to accomplish,
+the only guaranteed way to match weights at *refit-time* is to pass the new ExportedProgram through the
+early phases of the compilation process to generate near-identical weight names. This can be expensive
+and is not always necessary.
+To avoid this, **at initial compile**, Torch-TensorRT will attempt to cache a direct mapping from PyTorch
+weights to TensorRT weights. This cache is stored in the compiled module as metadata and can be used
+to speed up refit. If the cache is not present, the refit system will fall back to rebuilding the mapping at
+refit-time. Use of this cache is controlled by the ``use_weight_map_cache`` parameter.
+
+Since the cache uses a heuristic-based system for matching PyTorch and TensorRT weights, you may want to verify the refit. This can be done by setting
+``verify_output`` to True and providing sample ``arg_inputs`` and ``kwarg_inputs``. When this is done, the refit
+system will run the refitted module and the user-supplied module on the same inputs and compare the outputs.
+
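+A minimal sketch of a cache-assisted, verified refit, reusing ``exp_program2`` and ``inputs`` from
+the tutorial above (the exact ``refit_module_weights`` signature should be checked against your
+installed version; the keyword names here follow the parameters described in this section):
+
+.. code-block:: python
+
+    from torch_tensorrt.dynamo import refit_module_weights
+
+    compiled_trt_ep = torch_trt.load("./compiled.ep")  # the program saved earlier
+    refitted_gm = refit_module_weights(
+        compiled_module=compiled_trt_ep,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,          # sample inputs, used for output verification
+        use_weight_map_cache=True,  # reuse the mapping cached at compile time
+        verify_output=True,         # compare refitted vs. reference outputs
+    )
+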
+In-Place Refit
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- # Currently python runtime does not support engine serialization. So the refitting will be done in the same runtime.
- # This usecase is more useful when you need to switch different weights in the same runtime, such as using Stable Diffusion.
+``in_place`` allows the user to refit the module in place. This is useful when the user wants to update the weights
+of the compiled module without creating a new module.
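+
+For example, continuing the sketch above with the same assumptions:
+
+.. code-block:: python
+
+    # Update the compiled module's weights in place rather than returning a new module
+    refit_module_weights(
+        compiled_module=compiled_trt_ep,
+        new_weight_module=exp_program2,
+        arg_inputs=inputs,
+        in_place=True,
+    )
+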
.. rst-class:: sphx-glr-timing
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt
new file mode 100644
index 0000000000..be90efc337
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt
@@ -0,0 +1,168 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/torch_export_gpt2.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py:
+
+
+.. _torch_export_gpt2:
+
+Compiling GPT2 using the Torch-TensorRT with dynamo backend
+==========================================================
+
+This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model.
+
+.. GENERATED FROM PYTHON SOURCE LINES 10-12
+
+Imports and Model Definition
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 12-17
+
+.. code-block:: python
+
+ import torch
+ import torch_tensorrt
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from utils import export_llm, generate
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 18-35
+
+.. code-block:: python
+
+
+ # Define the parameters and initialize the model
+ MAX_TOKENS = 32
+ DEVICE = torch.device("cuda:0")
+
+ # Define the GPT2 model from hugging face
+ # kv_cache is not supported in Torch-TRT currently.
+ # CPU is used here so that GPU memory is reserved for TRT compilation.
+ with torch.no_grad():
+ tokenizer = AutoTokenizer.from_pretrained("gpt2")
+ model = AutoModelForCausalLM.from_pretrained(
+ "gpt2",
+ pad_token_id=tokenizer.eos_token_id,
+ use_cache=False,
+ attn_implementation="eager",
+ ).eval()
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 36-37
+
+Tokenize a sample input prompt and get pytorch model outputs
+
+.. GENERATED FROM PYTHON SOURCE LINES 37-46
+
+.. code-block:: python
+
+ prompt = "I enjoy walking with my cute dog"
+ model_inputs = tokenizer(prompt, return_tensors="pt")
+ input_ids = model_inputs["input_ids"]
+
+ # Auto-regressive generation loop for greedy decoding using PyTorch model
+ # We use a custom generate function which is very similar to the huggingface one.
+ pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+
+
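+The ``generate`` helper above is imported from the example's ``utils`` module. A rough sketch of
+what such a greedy-decoding loop does (an illustration only, not the exact utility shipped with
+the examples):
+
+.. code-block:: python
+
+    def generate_sketch(model, input_ids, max_tokens, eos_token_id):
+        # Greedily append the most likely next token until EOS or the token budget is hit
+        while input_ids.shape[1] < max_tokens:
+            outputs = model(input_ids)
+            logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
+            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
+            input_ids = torch.cat([input_ids, next_token], dim=-1)
+            if (next_token == eos_token_id).all():
+                break
+        return input_ids
+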
+.. GENERATED FROM PYTHON SOURCE LINES 47-49
+
+Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 49-67
+
+.. code-block:: python
+
+
+ # Export the GPT2 model into an ExportedProgram which is input of TRT compilation
+ gpt2_ep = export_llm(model, input_ids, max_seq_len=1024)
+ trt_model = torch_tensorrt.dynamo.compile(
+ gpt2_ep,
+ inputs=[input_ids],
+ enabled_precisions={torch.float32},
+ truncate_double=True,
+ device=DEVICE,
+ disable_tf32=True,
+ )
+
+ # Auto-regressive generation loop for greedy decoding using TensorRT model
+ # We use a custom generate function which is very similar to the huggingface one.
+ # Move inputs to GPU
+ input_ids = input_ids.to(DEVICE)
+ trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 68-70
+
+Decode the output sentences of PyTorch and TensorRT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 70-81
+
+.. code-block:: python
+
+ print("=============================")
+ print(
+ "Pytorch model generated text: ",
+ tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
+ )
+ print("=============================")
+ print(
+ "TensorRT model generated text: ",
+ tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
+ )
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 82-87
+
+The output sentences should look like
+=============================
+Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
+=============================
+TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_torch_export_gpt2.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: torch_export_gpt2.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: torch_export_gpt2.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt
new file mode 100644
index 0000000000..5e66a72aab
--- /dev/null
+++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt
@@ -0,0 +1,175 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "tutorials/_rendered_examples/dynamo/torch_export_llama2.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+ .. note::
+ :class: sphx-glr-download-link-note
+
+ :ref:`Go to the end `
+ to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py:
+
+
+.. _torch_export_llama2:
+
+Compiling Llama2 using the Torch-TensorRT with dynamo backend
+==========================================================
+
+This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model.
+
+.. GENERATED FROM PYTHON SOURCE LINES 10-12
+
+Imports and Model Definition
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 12-17
+
+.. code-block:: python
+
+ import torch
+ import torch_tensorrt
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from utils import export_llm, generate
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 18-19
+
+Define the parameters and initialize the model
+
+.. GENERATED FROM PYTHON SOURCE LINES 19-33
+
+.. code-block:: python
+
+ MAX_TOKENS = 32
+ DEVICE = torch.device("cuda:0")
+
+ # Define the Llama2 model from hugging face
+ # kv_cache is not supported in Torch-TRT currently.
+ # CPU is used here so that GPU memory is reserved for TRT compilation.
+ llama_path = "meta-llama/Llama-2-7b-chat-hf"
+ with torch.no_grad():
+ model = AutoModelForCausalLM.from_pretrained(
+ llama_path, use_cache=False, attn_implementation="eager"
+ ).eval()
+
+ tokenizer = AutoTokenizer.from_pretrained(llama_path)
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 34-35
+
+Tokenize a sample input prompt and get pytorch model outputs
+
+.. GENERATED FROM PYTHON SOURCE LINES 35-43
+
+.. code-block:: python
+
+ prompt = "What is dynamic programming?"
+ model_inputs = tokenizer(prompt, return_tensors="pt")
+ input_ids = model_inputs.input_ids
+
+ # Auto-regressive generation loop for greedy decoding using PyTorch model
+ # We use a custom generate function which is very similar to the huggingface one.
+ pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 44-46
+
+Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 46-65
+
+.. code-block:: python
+
+
+ # Export the llama2 model into an ExportedProgram which is input of TRT compilation
+ llama2_ep = export_llm(model, input_ids, max_seq_len=64)
+ trt_model = torch_tensorrt.dynamo.compile(
+ llama2_ep,
+ inputs=[input_ids],
+ enabled_precisions={torch.float32},
+ min_block_size=1,
+ truncate_double=True,
+ device=DEVICE,
+ disable_tf32=True,
+ )
+
+ # Auto-regressive generation loop for greedy decoding using TensorRT model
+ # We use a custom generate function which is very similar to the huggingface one.
+ # Move inputs to GPU
+ input_ids = input_ids.to(DEVICE)
+ trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 66-68
+
+Decode the output sentences of PyTorch and TensorRT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. GENERATED FROM PYTHON SOURCE LINES 68-85
+
+.. code-block:: python
+
+ print("=============================")
+ print(
+ "Pytorch model generated text: ",
+ tokenizer.batch_decode(
+ pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )[0],
+ )
+ print("=============================")
+ print(
+ "TensorRT model generated text: ",
+ tokenizer.batch_decode(
+ trt_gen_tokens,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=False,
+ )[0],
+ )
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 86-91
+
+The output sentences should look like
+=============================
+Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
+=============================
+TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
+
+
+.. rst-class:: sphx-glr-timing
+
+ **Total running time of the script:** ( 0 minutes 0.000 seconds)
+
+
+.. _sphx_glr_download_tutorials__rendered_examples_dynamo_torch_export_llama2.py:
+
+.. only:: html
+
+ .. container:: sphx-glr-footer sphx-glr-footer-example
+
+
+
+
+ .. container:: sphx-glr-download sphx-glr-download-python
+
+ :download:`Download Python source code: torch_export_llama2.py `
+
+ .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+ :download:`Download Jupyter notebook: torch_export_llama2.ipynb `
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+ `Gallery generated by Sphinx-Gallery `_
diff --git a/docs/_sources/tutorials/_rendered_examples/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/index.rst.txt
index f68c1fb417..c688d7370f 100644
--- a/docs/_sources/tutorials/_rendered_examples/index.rst.txt
+++ b/docs/_sources/tutorials/_rendered_examples/index.rst.txt
@@ -35,6 +35,8 @@ a number of ways you can leverage this backend to accelerate inference.
* :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights
* :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile``
+* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times
+* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT
@@ -77,6 +79,23 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py`
+
+.. raw:: html
+
+
Refitting Torch-TensorRT Programs with New Weights
+
+
+
.. raw:: html
@@ -96,18 +115,18 @@ a number of ways you can leverage this backend to accelerate inference.
.. raw:: html
-
+
.. only:: html
- .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png
:alt:
- :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py`
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py`
.. raw:: html
- Refit TenorRT Graph Module with Torch-TensorRT
+ Compiling GPT2 using the Torch-TensorRT with dynamo backend
@@ -128,6 +147,40 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py`
+
+.. raw:: html
+
+
Compiling Llama2 using the Torch-TensorRT with dynamo backend
+
+
+
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py`
+
+.. raw:: html
+
+
Engine Caching (BERT)
+
+
+
.. raw:: html
@@ -179,6 +232,23 @@ a number of ways you can leverage this backend to accelerate inference.
+.. raw:: html
+
+
+
+.. only:: html
+
+ .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_example_thumb.png
+ :alt:
+
+ :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py`
+
+.. raw:: html
+
+
Engine Caching
+
+
+
.. raw:: html
diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js
index c01a50945c..6b61b71ddb 100644
--- a/docs/_static/documentation_options.js
+++ b/docs/_static/documentation_options.js
@@ -1,6 +1,6 @@
var DOCUMENTATION_OPTIONS = {
URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
- VERSION: 'v2.5.0.dev0+b3a8cdd',
+ VERSION: 'v2.5.0.dev0+a4a9419',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
diff --git a/docs/cli/torchtrtc.html b/docs/cli/torchtrtc.html
index f6792e2ae9..e9ad12f49b 100644
--- a/docs/cli/torchtrtc.html
+++ b/docs/cli/torchtrtc.html
@@ -10,7 +10,7 @@
- torchtrtc — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation
+ torchtrtc — Torch-TensorRT v2.5.0.dev0+a4a9419 documentation
@@ -275,7 +275,7 @@
- v2.5.0.dev0+b3a8cdd
+ v2.5.0.dev0+a4a9419
@@ -316,6 +316,9 @@
- Deploying Torch-TensorRT Programs
- DLA
- Torch Compile Advanced Usage
+ - Deploy Quantized Models using Torch-TensorRT
+ - Engine Caching
+ - Refitting Torch-TensorRT Programs with New Weights
Dynamo Frontend