diff --git a/core/runtime/Platform.cpp b/core/runtime/Platform.cpp index a20159cd91..03d9e7580b 100644 --- a/core/runtime/Platform.cpp +++ b/core/runtime/Platform.cpp @@ -36,7 +36,6 @@ Platform::Platform() : _platform{Platform::PlatformEnum::kUNKNOWN} {} Platform::Platform(Platform::PlatformEnum val) : _platform{val} {} Platform::Platform(const std::string& platform_str) { - LOG_ERROR("Platform constructor: " << platform_str); auto name_map = get_name_to_platform_map(); auto it = name_map.find(platform_str); if (it != name_map.end()) { diff --git a/core/runtime/register_jit_hooks.cpp b/core/runtime/register_jit_hooks.cpp index b17c2988be..d02f098904 100644 --- a/core/runtime/register_jit_hooks.cpp +++ b/core/runtime/register_jit_hooks.cpp @@ -112,7 +112,6 @@ static auto TORCHTRT_UNUSED TRTEngineTSRegistrtion = return serialize_info; }, [](std::vector serialized_info) -> c10::intrusive_ptr { - LOG_ERROR(serialized_info[TARGET_PLATFORM_IDX]); serialized_info[ENGINE_IDX] = base64_decode(serialized_info[ENGINE_IDX]); TRTEngine::verify_serialization_fmt(serialized_info); return c10::make_intrusive(serialized_info); diff --git a/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html b/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html index 31e6b0f3e3..87984fabb9 100644 --- a/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html +++ b/docs/_cpp_api/classtorch__tensorrt_1_1DataType.html @@ -10,7 +10,7 @@ - Class DataType — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Class DataType — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -275,7 +275,7 @@
- v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
@@ -316,6 +316,9 @@
  • Deploying Torch-TensorRT Programs
  • DLA
  • Torch Compile Advanced Usage
  • +
  • Deploy Quantized Models using Torch-TensorRT
  • +
  • Engine Caching
  • +
  • Refitting Torch-TensorRT Programs with New Weights
  • Dynamo Frontend

      @@ -339,7 +342,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • diff --git a/docs/_downloads/06a1dddfb8c2b5515b697700d863a453/engine_caching_bert_example.ipynb b/docs/_downloads/06a1dddfb8c2b5515b697700d863a453/engine_caching_bert_example.ipynb new file mode 100644 index 0000000000..fe7a070b26 --- /dev/null +++ b/docs/_downloads/06a1dddfb8c2b5515b697700d863a453/engine_caching_bert_example.ipynb @@ -0,0 +1,43 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n\n# Engine Caching (BERT)\n\nSmall caching example on BERT.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\nimport torch\nimport torch_tensorrt\nfrom engine_caching_example import remove_timing_cache\nfrom transformers import BertModel\n\nnp.random.seed(0)\ntorch.manual_seed(0)\n\nmodel = BertModel.from_pretrained(\"bert-base-uncased\", return_dict=False).cuda().eval()\ninputs = [\n torch.randint(0, 2, (1, 14), dtype=torch.int32).to(\"cuda\"),\n torch.randint(0, 2, (1, 14), dtype=torch.int32).to(\"cuda\"),\n]\n\n\ndef compile_bert(iterations=3):\n times = []\n start = torch.cuda.Event(enable_timing=True)\n end = torch.cuda.Event(enable_timing=True)\n\n # The 1st iteration is to measure the compilation time without engine caching\n # The 2nd and 3rd iterations are to measure the compilation time with engine caching.\n # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.\n # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.\n for i in range(iterations):\n # remove timing cache and reset dynamo for engine caching messurement\n remove_timing_cache()\n torch._dynamo.reset()\n\n if i == 0:\n cache_built_engines = False\n reuse_cached_engines = False\n else:\n cache_built_engines = True\n reuse_cached_engines = True\n\n start.record()\n compilation_kwargs = {\n \"use_python_runtime\": False,\n \"enabled_precisions\": {torch.float},\n \"truncate_double\": True,\n \"debug\": False,\n \"min_block_size\": 1,\n \"make_refitable\": True,\n \"cache_built_engines\": cache_built_engines,\n \"reuse_cached_engines\": reuse_cached_engines,\n \"engine_cache_dir\": \"/tmp/torch_trt_bert_engine_cache\",\n \"engine_cache_size\": 1 << 30, # 1GB\n }\n optimized_model = torch.compile(\n model,\n backend=\"torch_tensorrt\",\n options=compilation_kwargs,\n )\n optimized_model(*inputs)\n end.record()\n torch.cuda.synchronize()\n times.append(start.elapsed_time(end))\n\n print(\"-----compile bert-----> compilation time:\\n\", times, \"milliseconds\")\n\n\nif __name__ == \"__main__\":\n compile_bert()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/_downloads/1c759c0181fe2845e5579cc82e5b7a7a/engine_caching_example.py b/docs/_downloads/1c759c0181fe2845e5579cc82e5b7a7a/engine_caching_example.py new file mode 100644 index 0000000000..5154dc1e2c --- /dev/null +++ b/docs/_downloads/1c759c0181fe2845e5579cc82e5b7a7a/engine_caching_example.py @@ -0,0 +1,288 @@ +""" + +.. 
_engine_caching_example:
+
+Engine Caching
+=======================
+
+As model sizes increase, the cost of compilation will as well. With AOT methods
+like ``torch.dynamo.compile``, this cost is paid upfront. However, if the weights
+change, the session ends, or you are using JIT methods like ``torch.compile``,
+graphs get invalidated and re-compiled, and this cost is paid repeatedly.
+Engine caching is a way to mitigate this cost by saving constructed engines to disk
+and re-using them when possible. This tutorial demonstrates how to use engine caching
+with TensorRT in PyTorch. Engine caching can significantly speed up subsequent model
+compilations by reusing previously built TensorRT engines.
+
+We'll explore two approaches:
+
+    1. Using torch_tensorrt.dynamo.compile
+    2. Using torch.compile with the TensorRT backend
+
+The example uses a pre-trained ResNet18 model and shows the
+differences between compilation without caching, with caching enabled,
+and when reusing cached engines.
+"""
+
+import os
+from typing import Dict, Optional
+
+import numpy as np
+import torch
+import torch_tensorrt as torch_trt
+import torchvision.models as models
+from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
+from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
+
+np.random.seed(0)
+torch.manual_seed(0)
+
+model = models.resnet18(pretrained=True).eval().to("cuda")
+enabled_precisions = {torch.float}
+debug = False
+min_block_size = 1
+use_python_runtime = False
+
+
+def remove_timing_cache(path=TIMING_CACHE_PATH):
+    if os.path.exists(path):
+        os.remove(path)
+
+
+# %%
+# Engine Caching for JIT Compilation
+# ----------------------------------
+#
+# The primary goal of engine caching is to help speed up JIT workflows. ``torch.compile``
+# provides a great deal of flexibility in model construction which makes it a good
+# first tool to try when looking to speed up your workflow. However, historically
+# the cost of compilation, and in particular recompilation, has been a barrier to entry
+# for many users. Prior to the addition of engine caching, if a subgraph got invalidated
+# for some reason, that graph was reconstructed from scratch. Now, with ``cache_built_engines=True``,
+# engines are saved to disk as they are constructed, tied to a hash of their corresponding PyTorch subgraph.
+# In a subsequent compilation, either as part of this session or a new session, the cache will
+# pull the built engine and **refit** the weights, which can reduce compilation times by orders of magnitude.
+# As such, in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``),
+# the engine must be refittable (``make_refitable=True``). See :ref:`refit_engine_example` for more details.
+
+
+def torch_compile(iterations=3):
+    times = []
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+
+    # The 1st iteration is to measure the compilation time without engine caching
+    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+    for i in range(iterations):
+        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+        # remove timing cache and reset dynamo just for engine caching measurement
+        remove_timing_cache()
+        torch._dynamo.reset()
+
+        if i == 0:
+            cache_built_engines = False
+            reuse_cached_engines = False
+        else:
+            cache_built_engines = True
+            reuse_cached_engines = True
+
+        start.record()
+        compiled_model = torch.compile(
+            model,
+            backend="tensorrt",
+            options={
+                "use_python_runtime": True,
+                "enabled_precisions": enabled_precisions,
+                "debug": debug,
+                "min_block_size": min_block_size,
+                "make_refitable": True,
+                "cache_built_engines": cache_built_engines,
+                "reuse_cached_engines": reuse_cached_engines,
+            },
+        )
+        compiled_model(*inputs)  # trigger the compilation
+        end.record()
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
+
+    print("----------------torch_compile----------------")
+    print("disable engine caching, used:", times[0], "ms")
+    print("enable engine caching to cache engines, used:", times[1], "ms")
+    print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+torch_compile()
+
+# %%
+# Engine Caching for AOT Compilation
+# ----------------------------------
+# Similarly to the JIT workflow, AOT workflows can benefit from engine caching.
+# As the same architecture or common subgraphs get recompiled, the cache will pull
+# previously built engines and refit the weights.
+
+
+def dynamo_compile(iterations=3):
+    times = []
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+
+    example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),)
+    # Mark the dim0 of inputs as dynamic
+    batch = torch.export.Dim("batch", min=1, max=200)
+    exp_program = torch.export.export(
+        model, args=example_inputs, dynamic_shapes={"x": {0: batch}}
+    )
+
+    # The 1st iteration is to measure the compilation time without engine caching
+    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+    for i in range(iterations):
+        inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
+        remove_timing_cache()  # remove timing cache just for engine caching measurement
+        if i == 0:
+            cache_built_engines = False
+            reuse_cached_engines = False
+        else:
+            cache_built_engines = True
+            reuse_cached_engines = True
+
+        start.record()
+        trt_gm = torch_trt.dynamo.compile(
+            exp_program,
+            tuple(inputs),
+            use_python_runtime=use_python_runtime,
+            enabled_precisions=enabled_precisions,
+            debug=debug,
+            min_block_size=min_block_size,
+            make_refitable=True,
+            cache_built_engines=cache_built_engines,
+            reuse_cached_engines=reuse_cached_engines,
+            engine_cache_size=1 << 30,  # 1GB
+        )
+        # output = trt_gm(*inputs)
+        end.record()
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
+
+    print("----------------dynamo_compile----------------")
+    print("disable engine caching, used:", times[0], "ms")
+    print("enable engine caching to cache engines, used:", times[1], "ms")
+    print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+dynamo_compile()
+
+# %%
+# Custom Engine Cache
+# ----------------------
+#
+# By default, the engine cache is stored in the system's temporary directory. Both the cache directory and
+# size limit can be customized by passing ``engine_cache_dir`` and ``engine_cache_size``, as sketched below.
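# A minimal sketch of pointing the cache at a custom location, assuming the ``engine_cache_dir``
# and ``engine_cache_size`` options described above; it reuses the ``model`` defined earlier in
# this file, and the directory path is just a placeholder.
sketch_inputs = [torch.rand((2, 3, 224, 224)).to("cuda")]
relocated_cache_model = torch.compile(
    model,
    backend="tensorrt",
    options={
        "make_refitable": True,  # required so built engines can be inserted into the cache
        "cache_built_engines": True,
        "reuse_cached_engines": True,
        "engine_cache_dir": "/tmp/my_engine_cache",  # placeholder cache directory
        "engine_cache_size": 1 << 30,  # 1GB size limit
    },
)
relocated_cache_model(*sketch_inputs)  # engines built here are cached under the custom directory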
+# Users can also define their own engine cache implementation by extending the ``BaseEngineCache`` class.
+# This allows for remote or shared caching if so desired.
+#
+# The custom engine cache should implement the following methods:
+#   - ``save``: Save the engine blob to the cache.
+#   - ``load``: Load the engine blob from the cache.
+#
+# The hash provided by the cache system is a weight-agnostic hash of the originating PyTorch subgraph (post lowering).
+# The blob contains a serialized engine, calling spec data, and weight map information in the pickle format.
+#
+# Below is an example of a custom engine cache implementation, ``RAMEngineCache``, which holds cached engines in memory.
+
+
+class RAMEngineCache(BaseEngineCache):
+    def __init__(
+        self,
+    ) -> None:
+        """
+        Constructs a user-held engine cache in memory.
+        """
+        self.engine_cache: Dict[str, bytes] = {}
+
+    def save(
+        self,
+        hash: str,
+        blob: bytes,
+    ):
+        """
+        Insert the engine blob into the cache.
+
+        Args:
+            hash (str): The hash key to associate with the engine blob.
+            blob (bytes): The engine blob to be saved.
+
+        Returns:
+            None
+        """
+        self.engine_cache[hash] = blob
+
+    def load(self, hash: str) -> Optional[bytes]:
+        """
+        Load the engine blob from the cache.
+
+        Args:
+            hash (str): The hash key of the engine to load.
+
+        Returns:
+            Optional[bytes]: The engine blob if found, None otherwise.
+        """
+        if hash in self.engine_cache:
+            return self.engine_cache[hash]
+        else:
+            return None
+
+
+def torch_compile_my_cache(iterations=3):
+    times = []
+    engine_cache = RAMEngineCache()
+    start = torch.cuda.Event(enable_timing=True)
+    end = torch.cuda.Event(enable_timing=True)
+
+    # The 1st iteration is to measure the compilation time without engine caching
+    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
+    for i in range(iterations):
+        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+        # remove timing cache and reset dynamo just for engine caching measurement
+        remove_timing_cache()
+        torch._dynamo.reset()
+
+        if i == 0:
+            cache_built_engines = False
+            reuse_cached_engines = False
+        else:
+            cache_built_engines = True
+            reuse_cached_engines = True
+
+        start.record()
+        compiled_model = torch.compile(
+            model,
+            backend="tensorrt",
+            options={
+                "use_python_runtime": True,
+                "enabled_precisions": enabled_precisions,
+                "debug": debug,
+                "min_block_size": min_block_size,
+                "make_refitable": True,
+                "cache_built_engines": cache_built_engines,
+                "reuse_cached_engines": reuse_cached_engines,
+                "custom_engine_cache": engine_cache,
+            },
+        )
+        compiled_model(*inputs)  # trigger the compilation
+        end.record()
+        torch.cuda.synchronize()
+        times.append(start.elapsed_time(end))
+
+    print("----------------torch_compile----------------")
+    print("disable engine caching, used:", times[0], "ms")
+    print("enable engine caching to cache engines, used:", times[1], "ms")
+    print("enable engine caching to reuse engines, used:", times[2], "ms")
+
+
+torch_compile_my_cache()
diff --git a/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py
new file mode 100644
index 0000000000..a26305e4a3
--- /dev/null
+++ b/docs/_downloads/2a9ac10f2667047a7f398d1593b7ca33/torch_export_gpt2.py
@@ -0,0 +1,86 @@
+"""
+..
_torch_export_gpt2: + +Compiling GPT2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import export_llm, generate + +# %% + +# Define the parameters and initialize the model +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. +with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ).eval() + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") +input_ids = model_inputs["input_ids"] + +# Auto-regressive generation loop for greedy decoding using PyTorch model +# We use a custom generate function which is very similar to the huggingface one. +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + + +# %% +# Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Export the GPT2 model into an ExportedProgram which is input of TRT compilation +gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) +trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, +) + +# Auto-regressive generation loop for greedy decoding using TensorRT model +# We use a custom generate function which is very similar to the huggingface one. +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), +) + +# %% +# The output sentences should look like +# ============================= +# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +# ============================= +# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll ever be able to walk with my diff --git a/docs/_downloads/34421db2f2a82ea2b3d9a9cc85624784/torch_export_gpt2.ipynb b/docs/_downloads/34421db2f2a82ea2b3d9a9cc85624784/torch_export_gpt2.ipynb new file mode 100644 index 0000000000..4623ccd105 --- /dev/null +++ b/docs/_downloads/34421db2f2a82ea2b3d9a9cc85624784/torch_export_gpt2.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n\n# Compiling GPT2 using the Torch-TensorRT with dynamo backend\n\nThis interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Imports and Model Definition\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom utils import export_llm, generate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Define the parameters and initialize the model\nMAX_TOKENS = 32\nDEVICE = torch.device(\"cuda:0\")\n\n# Define the GPT2 model from hugging face\n# kv_cache is not supported in Torch-TRT currently.\n# CPU is used here so that GPU memory is reserved for TRT compilation.\nwith torch.no_grad():\n tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n model = AutoModelForCausalLM.from_pretrained(\n \"gpt2\",\n pad_token_id=tokenizer.eos_token_id,\n use_cache=False,\n attn_implementation=\"eager\",\n ).eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Tokenize a sample input prompt and get pytorch model outputs\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prompt = \"I enjoy walking with my cute dog\"\nmodel_inputs = tokenizer(prompt, return_tensors=\"pt\")\ninput_ids = model_inputs[\"input_ids\"]\n\n# Auto-regressive generation loop for greedy decoding using PyTorch model\n# We use a custom generate function which is very similar to the huggingface one.\npyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Export the GPT2 model into an ExportedProgram which is input of TRT compilation\ngpt2_ep = export_llm(model, input_ids, max_seq_len=1024)\ntrt_model = torch_tensorrt.dynamo.compile(\n gpt2_ep,\n inputs=[input_ids],\n enabled_precisions={torch.float32},\n truncate_double=True,\n device=DEVICE,\n disable_tf32=True,\n)\n\n# Auto-regressive generation loop for greedy decoding using TensorRT model\n# We use a custom generate function which is very similar to the huggingface one.\n# Move inputs to GPU\ninput_ids = input_ids.to(DEVICE)\ntrt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Decode the output sentences of PyTorch and TensorRT\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"print(\"=============================\")\nprint(\n \"Pytorch model generated text: \",\n tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),\n)\nprint(\"=============================\")\nprint(\n \"TensorRT model generated text: \",\n tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),\n)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The output sentences should look like\nPytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my\n=============================\nTensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my\n\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/_downloads/3454ee6d4b68e83cdf0c757f0059986b/engine_caching_example.ipynb b/docs/_downloads/3454ee6d4b68e83cdf0c757f0059986b/engine_caching_example.ipynb new file mode 100644 index 0000000000..5df63748b4 --- /dev/null +++ b/docs/_downloads/3454ee6d4b68e83cdf0c757f0059986b/engine_caching_example.ipynb @@ -0,0 +1,97 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n\n# Engine Caching\n\nAs model sizes increase, the cost of compilation will as well. With AOT methods\nlike ``torch.dynamo.compile``, this cost is paid upfront. However if the weights\nchange, the session ends or you are using JIT methods like ``torch.compile``, as\ngraphs get invalidated they get re-compiled, this cost will get paid repeatedly.\nEngine caching is a way to mitigate this cost by saving constructed engines to disk\nand re-using them when possible. This tutorial demonstrates how to use engine caching\nwith TensorRT in PyTorch. Engine caching can significantly speed up subsequent model\ncompilations reusing previously built TensorRT engines.\n\nWe'll explore two approaches:\n\n 1. Using torch_tensorrt.dynamo.compile\n 2. 
Using torch.compile with the TensorRT backend\n\nThe example uses a pre-trained ResNet18 model and shows the\ndifferences between compilation without caching, with caching enabled,\nand when reusing cached engines.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\nfrom typing import Dict, Optional\n\nimport numpy as np\nimport torch\nimport torch_tensorrt as torch_trt\nimport torchvision.models as models\nfrom torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH\nfrom torch_tensorrt.dynamo._engine_cache import BaseEngineCache\n\nnp.random.seed(0)\ntorch.manual_seed(0)\n\nmodel = models.resnet18(pretrained=True).eval().to(\"cuda\")\nenabled_precisions = {torch.float}\ndebug = False\nmin_block_size = 1\nuse_python_runtime = False\n\n\ndef remove_timing_cache(path=TIMING_CACHE_PATH):\n if os.path.exists(path):\n os.remove(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Engine Caching for JIT Compilation\n\nThe primary goal of engine caching is to help speed up JIT workflows. ``torch.compile``\nprovides a great deal of flexibility in model construction which makes it a good\nfirst tool to try when looking to speed up your workflow. However, historically\nthe cost of compilation and in particular recompilation has been a barrier to entry\nfor many users. If for some reason a subgraph gets invalidated, that graph is reconstructed\nscratch prior to the addition of engine caching. Now as engines are constructed, with ``cache_built_engines=True``,\nengines are saved to disk tied to a hash of their corresponding PyTorch subgraph. If\nin a subsequent compilation, either as part of this session or a new session, the cache will\npull the built engine and **refit** the weights which can reduce compilation times by orders of magnitude.\nAs such, in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``),\nthe engine must be refitable (``make_refittable=True``). 
See `refit_engine_example` for more details.\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def torch_compile(iterations=3):\n times = []\n start = torch.cuda.Event(enable_timing=True)\n end = torch.cuda.Event(enable_timing=True)\n\n # The 1st iteration is to measure the compilation time without engine caching\n # The 2nd and 3rd iterations are to measure the compilation time with engine caching.\n # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.\n # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.\n for i in range(iterations):\n inputs = [torch.rand((100, 3, 224, 224)).to(\"cuda\")]\n # remove timing cache and reset dynamo just for engine caching messurement\n remove_timing_cache()\n torch._dynamo.reset()\n\n if i == 0:\n cache_built_engines = False\n reuse_cached_engines = False\n else:\n cache_built_engines = True\n reuse_cached_engines = True\n\n start.record()\n compiled_model = torch.compile(\n model,\n backend=\"tensorrt\",\n options={\n \"use_python_runtime\": True,\n \"enabled_precisions\": enabled_precisions,\n \"debug\": debug,\n \"min_block_size\": min_block_size,\n \"make_refitable\": True,\n \"cache_built_engines\": cache_built_engines,\n \"reuse_cached_engines\": reuse_cached_engines,\n },\n )\n compiled_model(*inputs) # trigger the compilation\n end.record()\n torch.cuda.synchronize()\n times.append(start.elapsed_time(end))\n\n print(\"----------------torch_compile----------------\")\n print(\"disable engine caching, used:\", times[0], \"ms\")\n print(\"enable engine caching to cache engines, used:\", times[1], \"ms\")\n print(\"enable engine caching to reuse engines, used:\", times[2], \"ms\")\n\n\ntorch_compile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Engine Caching for AOT Compilation\nSimilarly to the JIT workflow, AOT workflows can benefit from engine caching.\nAs the same architecture or common subgraphs get recompiled, the cache will pull\npreviously built engines and refit the weights.\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def dynamo_compile(iterations=3):\n times = []\n start = torch.cuda.Event(enable_timing=True)\n end = torch.cuda.Event(enable_timing=True)\n\n example_inputs = (torch.randn((100, 3, 224, 224)).to(\"cuda\"),)\n # Mark the dim0 of inputs as dynamic\n batch = torch.export.Dim(\"batch\", min=1, max=200)\n exp_program = torch.export.export(\n model, args=example_inputs, dynamic_shapes={\"x\": {0: batch}}\n )\n\n # The 1st iteration is to measure the compilation time without engine caching\n # The 2nd and 3rd iterations are to measure the compilation time with engine caching.\n # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.\n # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.\n for i in range(iterations):\n inputs = [torch.rand((100 + i, 3, 224, 224)).to(\"cuda\")]\n remove_timing_cache() # remove timing cache just for engine caching messurement\n if i == 0:\n cache_built_engines = False\n reuse_cached_engines = False\n else:\n cache_built_engines = True\n reuse_cached_engines = True\n\n start.record()\n trt_gm = torch_trt.dynamo.compile(\n exp_program,\n tuple(inputs),\n use_python_runtime=use_python_runtime,\n 
enabled_precisions=enabled_precisions,\n debug=debug,\n min_block_size=min_block_size,\n make_refitable=True,\n cache_built_engines=cache_built_engines,\n reuse_cached_engines=reuse_cached_engines,\n engine_cache_size=1 << 30, # 1GB\n )\n # output = trt_gm(*inputs)\n end.record()\n torch.cuda.synchronize()\n times.append(start.elapsed_time(end))\n\n print(\"----------------dynamo_compile----------------\")\n print(\"disable engine caching, used:\", times[0], \"ms\")\n print(\"enable engine caching to cache engines, used:\", times[1], \"ms\")\n print(\"enable engine caching to reuse engines, used:\", times[2], \"ms\")\n\n\ndynamo_compile()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom Engine Cache\n\nBy default, the engine cache is stored in the system's temporary directory. Both the cache directory and\nsize limit can be customized by passing ``engine_cache_dir`` and ``engine_cache_size``.\nUsers can also define their own engine cache implementation by extending the ``BaseEngineCache`` class.\nThis allows for remote or shared caching if so desired.\n\nThe custom engine cache should implement the following methods:\n - ``save``: Save the engine blob to the cache.\n - ``load``: Load the engine blob from the cache.\n\nThe hash provided by the cache systen is a weight agnostic hash of the originating PyTorch subgraph (post lowering).\nThe blob contains a serialized engine, calling spec data, and weight map information in the pickle format\n\nBelow is an example of a custom engine cache implementation that implents a ``RAMEngineCache``.\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class RAMEngineCache(BaseEngineCache):\n def __init__(\n self,\n ) -> None:\n \"\"\"\n Constructs a user held engine cache in memory.\n \"\"\"\n self.engine_cache: Dict[str, bytes] = {}\n\n def save(\n self,\n hash: str,\n blob: bytes,\n ):\n \"\"\"\n Insert the engine blob to the cache.\n\n Args:\n hash (str): The hash key to associate with the engine blob.\n blob (bytes): The engine blob to be saved.\n\n Returns:\n None\n \"\"\"\n self.engine_cache[hash] = blob\n\n def load(self, hash: str) -> Optional[bytes]:\n \"\"\"\n Load the engine blob from the cache.\n\n Args:\n hash (str): The hash key of the engine to load.\n\n Returns:\n Optional[bytes]: The engine blob if found, None otherwise.\n \"\"\"\n if hash in self.engine_cache:\n return self.engine_cache[hash]\n else:\n return None\n\n\ndef torch_compile_my_cache(iterations=3):\n times = []\n engine_cache = RAMEngineCache()\n start = torch.cuda.Event(enable_timing=True)\n end = torch.cuda.Event(enable_timing=True)\n\n # The 1st iteration is to measure the compilation time without engine caching\n # The 2nd and 3rd iterations are to measure the compilation time with engine caching.\n # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.\n # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.\n for i in range(iterations):\n inputs = [torch.rand((100, 3, 224, 224)).to(\"cuda\")]\n # remove timing cache and reset dynamo just for engine caching messurement\n remove_timing_cache()\n torch._dynamo.reset()\n\n if i == 0:\n cache_built_engines = False\n reuse_cached_engines = False\n else:\n cache_built_engines = True\n reuse_cached_engines = True\n\n start.record()\n compiled_model = torch.compile(\n model,\n backend=\"tensorrt\",\n options={\n 
\"use_python_runtime\": True,\n \"enabled_precisions\": enabled_precisions,\n \"debug\": debug,\n \"min_block_size\": min_block_size,\n \"make_refitable\": True,\n \"cache_built_engines\": cache_built_engines,\n \"reuse_cached_engines\": reuse_cached_engines,\n \"custom_engine_cache\": engine_cache,\n },\n )\n compiled_model(*inputs) # trigger the compilation\n end.record()\n torch.cuda.synchronize()\n times.append(start.elapsed_time(end))\n\n print(\"----------------torch_compile----------------\")\n print(\"disable engine caching, used:\", times[0], \"ms\")\n print(\"enable engine caching to cache engines, used:\", times[1], \"ms\")\n print(\"enable engine caching to reuse engines, used:\", times[2], \"ms\")\n\n\ntorch_compile_my_cache()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/_downloads/6a6052d9668b2cb8332d349d328e21c1/_rendered_examples_jupyter.zip b/docs/_downloads/6a6052d9668b2cb8332d349d328e21c1/_rendered_examples_jupyter.zip index 9ebd46bac6..e6fa07cc75 100644 Binary files a/docs/_downloads/6a6052d9668b2cb8332d349d328e21c1/_rendered_examples_jupyter.zip and b/docs/_downloads/6a6052d9668b2cb8332d349d328e21c1/_rendered_examples_jupyter.zip differ diff --git a/docs/_downloads/798cda8f83bd9f5e2cc93f329a04332c/_rendered_examples_python.zip b/docs/_downloads/798cda8f83bd9f5e2cc93f329a04332c/_rendered_examples_python.zip index 43eba5e840..5633b02960 100644 Binary files a/docs/_downloads/798cda8f83bd9f5e2cc93f329a04332c/_rendered_examples_python.zip and b/docs/_downloads/798cda8f83bd9f5e2cc93f329a04332c/_rendered_examples_python.zip differ diff --git a/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py new file mode 100644 index 0000000000..195944688b --- /dev/null +++ b/docs/_downloads/7b7004dc2ea6f839be532665e16e0426/torch_export_llama2.py @@ -0,0 +1,90 @@ +""" +.. _torch_export_llama2: + +Compiling Llama2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import export_llm, generate + +# %% +# Define the parameters and initialize the model +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the Llama2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. +llama_path = "meta-llama/Llama-2-7b-chat-hf" +with torch.no_grad(): + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() + +tokenizer = AutoTokenizer.from_pretrained(llama_path) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "What is dynamic programming?" 
+model_inputs = tokenizer(prompt, return_tensors="pt")
+input_ids = model_inputs.input_ids
+
+# Auto-regressive generation loop for greedy decoding using PyTorch model
+# We use a custom generate function which is very similar to the huggingface one.
+pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+# %%
+# Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Export the llama2 model into an ExportedProgram which is input of TRT compilation
+llama2_ep = export_llm(model, input_ids, max_seq_len=64)
+trt_model = torch_tensorrt.dynamo.compile(
+    llama2_ep,
+    inputs=[input_ids],
+    enabled_precisions={torch.float32},
+    min_block_size=1,
+    truncate_double=True,
+    device=DEVICE,
+    disable_tf32=True,
+)
+
+# Auto-regressive generation loop for greedy decoding using TensorRT model
+# We use a custom generate function which is very similar to the huggingface one.
+# Move inputs to GPU
+input_ids = input_ids.to(DEVICE)
+trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+# %%
+# Decode the output sentences of PyTorch and TensorRT
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+print("=============================")
+print(
+    "Pytorch model generated text: ",
+    tokenizer.batch_decode(
+        pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0],
+)
+print("=============================")
+print(
+    "TensorRT model generated text: ",
+    tokenizer.batch_decode(
+        trt_gen_tokens,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )[0],
+)
+
+# %%
+# The output sentences should look like
+# =============================
+# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
+# =============================
+# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my
diff --git a/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py b/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py
index c8cd5590d3..1feb033a3a 100644
--- a/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py
+++ b/docs/_downloads/7e3a125a2d4ba8274a41b46f5e0723fa/refit_engine_example.py
@@ -1,19 +1,26 @@
 """
 .. _refit_engine_example:
 
-Refit TenorRT Graph Module with Torch-TensorRT
+Refitting Torch-TensorRT Programs with New Weights
 ===================================================================
 
-We are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights.
-
-In many cases, we frequently update the weights of models, such as applying various LoRA to Stable Diffusion or constant A/B testing of AI products.
-That poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient.
-Torch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow.
+Compilation is an expensive operation as it involves many graph transformations, translations
+and optimizations applied to the model. In cases where the weights of a model might be updated
+occasionally (e.g.
inserting LoRA adapters), the large cost of recompilation can make it infeasible
+to use TensorRT if the compiled program had to be built from scratch each time. Torch-TensorRT
+provides a PyTorch-native mechanism to update the weights of a compiled TensorRT program without
+recompiling from scratch through weight refitting.
 
 In this tutorial, we are going to walk through
-1. Compiling a PyTorch model to a TensorRT Graph Module
-2. Save and load a graph module
-3. Refit the graph module
+
+  1. Compiling a PyTorch model to a TensorRT Graph Module
+  2. Save and load a graph module
+  3. Refit the graph module
+
+This tutorial focuses mostly on the AOT workflow, where it is most likely that a user might need to
+manually refit a module. In the JIT workflow, weight changes trigger recompilation. As the engine
+has previously been built, with an engine cache enabled, Torch-TensorRT can automatically recognize
+a previously built engine, trigger refit and short-circuit recompilation on behalf of the user (see: :ref:`engine_caching_example`).
 """
 
 # %%
 
@@ -36,10 +43,17 @@
 
 # %%
-# Compile the module for the first time and save it.
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-model = models.resnet18(pretrained=True).eval().to("cuda")
+# Make a Refittable Compilation Program
+# ---------------------------------------
+#
+# The initial step is to compile a module and save it as you normally would. Note that there is an
+# additional parameter `make_refitable` that is set to `True`. This parameter is used to
+# indicate that the engine being built should support weight refitting later. Engines built without
+# this setting will not be able to be refit.
+#
+# In this case, we are going to compile a ResNet18 model with randomly initialized weights and save it.
+
+model = models.resnet18(pretrained=False).eval().to("cuda")
 exp_program = torch.export.export(model, tuple(inputs))
 enabled_precisions = {torch.float}
 debug = False
@@ -59,16 +73,20 @@
 ) # Output is a torch.fx.GraphModule
 
 # Save the graph module as an exported program
-# This is only supported when use_python_runtime = False
 torch_trt.save(trt_gm, "./compiled.ep", inputs=inputs)
 
 # %%
-# Refit the module with update model weights
-# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# Refit the Program with Pretrained Weights
+# ------------------------------------------
+#
+# Random weights are not useful for inference. But now, instead of recompiling the model, we can
+# refit the model with the pretrained weights. This is done by setting up another PyTorch module
+# with the target weights and exporting it as an ExportedProgram. Then the ``refit_module_weights``
+# function is used to update the weights of the compiled module with the new weights.
 
 # Create and compile the updated model
-model2 = models.resnet18(pretrained=False).eval().to("cuda")
+model2 = models.resnet18(pretrained=True).eval().to("cuda")
 exp_program2 = torch.export.export(model2, tuple(inputs))
 
@@ -91,8 +109,32 @@
 print("Refit successfully!")
 
 # %%
-# Alternative Workflow using Python Runtime
+#
+# Advanced Usage
 # -----------------------------
-
-# Currently python runtime does not support engine serialization. So the refitting will be done in the same runtime.
-# This usecase is more useful when you need to switch different weights in the same runtime, such as using Stable Diffusion.
+#
+# There are a number of settings you can use to control the refit process.
+#
+# Weight Map Cache
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# Weight refitting works by matching the weights of the compiled module with the new weights from
+# the user-supplied ExportedProgram. Since 1:1 name matching from PyTorch to TensorRT is hard to accomplish,
+# the only guaranteed way to match weights at *refit-time* is to pass the new ExportedProgram through the
+# early phases of the compilation process to generate near-identical weight names. This can be expensive
+# and is not always necessary.
+#
+# To avoid this, **at initial compile**, Torch-TensorRT will attempt to cache a direct mapping from PyTorch
+# weights to TensorRT weights. This cache is stored in the compiled module as metadata and can be used
+# to speed up refit. If the cache is not present, the refit system will fall back to rebuilding the mapping at
+# refit-time. Use of this cache is controlled by the ``use_weight_map_cache`` parameter.
+#
+# Since the cache uses a heuristic-based system for matching PyTorch and TensorRT weights, you may want to verify the refitting. This can be done by setting
+# ``verify_output`` to True and providing sample ``arg_inputs`` and ``kwarg_inputs``. When this is done, the refit
+# system will run the refitted module and the user-supplied module on the same inputs and compare the outputs.
+#
+# In-Place Refit
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# ``in_place`` allows the user to refit the module in place. This is useful when the user wants to update the weights
+# of the compiled module without creating a new module.
diff --git a/docs/_downloads/9e148ac48490c84d381ee281904f3226/torch_export_llama2.ipynb b/docs/_downloads/9e148ac48490c84d381ee281904f3226/torch_export_llama2.ipynb
new file mode 100644
index 0000000000..d19be753a0
--- /dev/null
+++ b/docs/_downloads/9e148ac48490c84d381ee281904f3226/torch_export_llama2.ipynb
@@ -0,0 +1,129 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n\n# Compiling Llama2 using the Torch-TensorRT with dynamo backend\n\nThis interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imports and Model Definition\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import torch\nimport torch_tensorrt\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\nfrom utils import export_llm, generate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Define the parameters and initialize the model\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "MAX_TOKENS = 32\nDEVICE = torch.device(\"cuda:0\")\n\n# Define the Llama2 model from hugging face\n# kv_cache is not supported in Torch-TRT currently.\n# CPU is used here so that GPU memory is reserved for TRT compilation.\nllama_path = \"meta-llama/Llama-2-7b-chat-hf\"\nwith torch.no_grad():\n    model = AutoModelForCausalLM.from_pretrained(\n        llama_path, use_cache=False, attn_implementation=\"eager\"\n    ).eval()\n\ntokenizer = AutoTokenizer.from_pretrained(llama_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Tokenize a sample input prompt and get pytorch model outputs\n\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prompt = \"What is dynamic programming?\"\nmodel_inputs = tokenizer(prompt, return_tensors=\"pt\")\ninput_ids = model_inputs.input_ids\n\n# Auto-regressive generation loop for greedy decoding using PyTorch model\n# We use a custom generate function which is very similar to the huggingface one.\npyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Export the llama2 model into an ExportedProgram which is input of TRT compilation\nllama2_ep = export_llm(model, input_ids, max_seq_len=64)\ntrt_model = torch_tensorrt.dynamo.compile(\n llama2_ep,\n inputs=[input_ids],\n enabled_precisions={torch.float32},\n min_block_size=1,\n truncate_double=True,\n device=DEVICE,\n disable_tf32=True,\n)\n\n# Auto-regressive generation loop for greedy decoding using TensorRT model\n# We use a custom generate function which is very similar to the huggingface one.\n# Move inputs to GPU\ninput_ids = input_ids.to(DEVICE)\ntrt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Decode the output sentences of PyTorch and TensorRT\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(\"=============================\")\nprint(\n \"Pytorch model generated text: \",\n tokenizer.batch_decode(\n pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False\n )[0],\n)\nprint(\"=============================\")\nprint(\n \"TensorRT model generated text: \",\n tokenizer.batch_decode(\n trt_gen_tokens,\n skip_special_tokens=True,\n clean_up_tokenization_spaces=False,\n )[0],\n)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The output sentences should look like\nPytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my\n=============================\nTensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll ever be able to walk with my\n\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/docs/_downloads/d9a9caffd95dc397ffb9ea9d37a89f06/refit_engine_example.ipynb b/docs/_downloads/d9a9caffd95dc397ffb9ea9d37a89f06/refit_engine_example.ipynb index 901f5b7743..8cf6de7904 100644 --- a/docs/_downloads/d9a9caffd95dc397ffb9ea9d37a89f06/refit_engine_example.ipynb +++ b/docs/_downloads/d9a9caffd95dc397ffb9ea9d37a89f06/refit_engine_example.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n\n# Refit TenorRT Graph Module with Torch-TensorRT\n\nWe are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights.\n\nIn many cases, we frequently update the weights of models, such as applying various LoRA to Stable Diffusion or constant A/B testing of AI products.\nThat poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient.\nTorch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow.\n\nIn this tutorial, we are going to walk through\n1. Compiling a PyTorch model to a TensorRT Graph Module\n2. Save and load a graph module\n3. Refit the graph module\n" + "\n\n# Refitting Torch-TensorRT Programs with New Weights\n\nCompilation is an expensive operation as it involves many graph transformations, translations\nand optimizations applied on the model. In cases were the weights of a model might be updated\noccasionally (e.g. inserting LoRA adapters), the large cost of recompilation can make it infeasible\nto use TensorRT if the compiled program needed to be built from scratch each time. Torch-TensorRT\nprovides a PyTorch native mechanism to update the weights of a compiled TensorRT program without\nrecompiling from scratch through weight refitting.\n\nIn this tutorial, we are going to walk through\n\n 1. Compiling a PyTorch model to a TensorRT Graph Module\n 2. Save and load a graph module\n 3. Refit the graph module\n\nThis tutorial focuses mostly on the AOT workflow where it is most likely that a user might need to\nmanually refit a module. In the JIT workflow, weight changes trigger recompilation. As the engine\nhas previously been built, with an engine cache enabled, Torch-TensorRT can automatically recognize\na previously built engine, trigger refit and short cut recompilation on behalf of the user (see: `engine_caching_example`).\n" ] }, { @@ -36,7 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Compile the module for the first time and save it.\n\n" + "## Make a Refitable Compilation Program\n\nThe inital step is to compile a module and save it as with a normal. Note that there is an\nadditional parameter `make_refitable` that is set to `True`. This parameter is used to\nindicate that the engine being built should support weight refitting later. 
Engines built without\nthese setttings will not be able to be refit.\n\nIn this case we are going to compile a ResNet18 model with randomly initialized weights and save it.\n\n" ] }, { @@ -47,14 +47,14 @@ }, "outputs": [], "source": [ - "model = models.resnet18(pretrained=True).eval().to(\"cuda\")\nexp_program = torch.export.export(model, tuple(inputs))\nenabled_precisions = {torch.float}\ndebug = False\nworkspace_size = 20 << 30\nmin_block_size = 0\nuse_python_runtime = False\ntorch_executed_ops = {}\ntrt_gm = torch_trt.dynamo.compile(\n exp_program,\n tuple(inputs),\n use_python_runtime=use_python_runtime,\n enabled_precisions=enabled_precisions,\n debug=debug,\n min_block_size=min_block_size,\n torch_executed_ops=torch_executed_ops,\n make_refitable=True,\n) # Output is a torch.fx.GraphModule\n\n# Save the graph module as an exported program\n# This is only supported when use_python_runtime = False\ntorch_trt.save(trt_gm, \"./compiled.ep\", inputs=inputs)" + "model = models.resnet18(pretrained=False).eval().to(\"cuda\")\nexp_program = torch.export.export(model, tuple(inputs))\nenabled_precisions = {torch.float}\ndebug = False\nworkspace_size = 20 << 30\nmin_block_size = 0\nuse_python_runtime = False\ntorch_executed_ops = {}\ntrt_gm = torch_trt.dynamo.compile(\n exp_program,\n tuple(inputs),\n use_python_runtime=use_python_runtime,\n enabled_precisions=enabled_precisions,\n debug=debug,\n min_block_size=min_block_size,\n torch_executed_ops=torch_executed_ops,\n make_refitable=True,\n) # Output is a torch.fx.GraphModule\n\n# Save the graph module as an exported program\ntorch_trt.save(trt_gm, \"./compiled.ep\", inputs=inputs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Refit the module with update model weights\n\n" + "## Refit the Program with Pretrained Weights\n\nRandom weights are not useful for inference. But now instead of recompiling the model, we can\nrefit the model with the pretrained weights. This is done by setting up another PyTorch module\nwith the target weights and exporting it as an ExportedProgram. Then the ``refit_module_weights``\nfunction is used to update the weights of the compiled module with the new weights.\n\n" ] }, { @@ -65,25 +65,14 @@ }, "outputs": [], "source": [ - "# Create and compile the updated model\nmodel2 = models.resnet18(pretrained=False).eval().to(\"cuda\")\nexp_program2 = torch.export.export(model2, tuple(inputs))\n\n\ncompiled_trt_ep = torch_trt.load(\"./compiled.ep\")\n\n# This returns a new module with updated weights\nnew_trt_gm = refit_module_weights(\n compiled_module=compiled_trt_ep,\n new_weight_module=exp_program2,\n arg_inputs=inputs,\n)\n\n# Check the output\nexpected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)\nfor expected_output, refitted_output in zip(expected_outputs, refitted_outputs):\n assert torch.allclose(\n expected_output, refitted_output, 1e-2, 1e-2\n ), \"Refit Result is not correct. 
Refit failed\"\n\nprint(\"Refit successfully!\")" + "# Create and compile the updated model\nmodel2 = models.resnet18(pretrained=True).eval().to(\"cuda\")\nexp_program2 = torch.export.export(model2, tuple(inputs))\n\n\ncompiled_trt_ep = torch_trt.load(\"./compiled.ep\")\n\n# This returns a new module with updated weights\nnew_trt_gm = refit_module_weights(\n compiled_module=compiled_trt_ep,\n new_weight_module=exp_program2,\n arg_inputs=inputs,\n)\n\n# Check the output\nexpected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)\nfor expected_output, refitted_output in zip(expected_outputs, refitted_outputs):\n assert torch.allclose(\n expected_output, refitted_output, 1e-2, 1e-2\n ), \"Refit Result is not correct. Refit failed\"\n\nprint(\"Refit successfully!\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Alternative Workflow using Python Runtime\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Currently python runtime does not support engine serialization. So the refitting will be done in the same runtime.\n# This usecase is more useful when you need to switch different weights in the same runtime, such as using Stable Diffusion." + "## Advanced Usage\n\nThere are a number of settings you can use to control the refit process\n\n### Weight Map Cache\n\nWeight refitting works by matching the weights of the compiled module with the new weights from\nthe user supplied ExportedProgram. Since 1:1 name matching from PyTorch to TensorRT is hard to accomplish,\nthe only gaurenteed way to match weights at *refit-time* is to pass the new ExportedProgram through the\nearly phases of the compilation process to generate near identical weight names. This can be expensive\nand is not always necessary.\n\nTo avoid this, **At initial compile**, Torch-TensorRt will attempt to cache a direct mapping from PyTorch\nweights to TensorRT weights. This cache is stored in the compiled module as metadata and can be used\nto speed up refit. If the cache is not present, the refit system will fallback to rebuilding the mapping at\nrefit-time. Use of this cache is controlled by the ``use_weight_map_cache`` parameter.\n\nSince the cache uses a heuristic based system for matching PyTorch and TensorRT weights, you may want to verify the refitting. This can be done by setting\n``verify_output`` to True and providing sample ``arg_inputs`` and ``kwarg_inputs``. When this is done, the refit\nsystem will run the refitted module and the user supplied module on the same inputs and compare the outputs.\n\n### In-Place Refit\n\n``in_place`` allows the user to refit the module in place. This is useful when the user wants to update the weights\nof the compiled module without creating a new module.\n\n" ] } ], diff --git a/docs/_downloads/fdd0cb7713d049345adec03926d28414/engine_caching_bert_example.py b/docs/_downloads/fdd0cb7713d049345adec03926d28414/engine_caching_bert_example.py new file mode 100644 index 0000000000..428c414a06 --- /dev/null +++ b/docs/_downloads/fdd0cb7713d049345adec03926d28414/engine_caching_bert_example.py @@ -0,0 +1,75 @@ +""" + +.. _engine_caching_bert_example: + +Engine Caching (BERT) +======================= + +Small caching example on BERT. 
+""" + +import numpy as np +import torch +import torch_tensorrt +from engine_caching_example import remove_timing_cache +from transformers import BertModel + +np.random.seed(0) +torch.manual_seed(0) + +model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() +inputs = [ + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), +] + + +def compile_bert(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + for i in range(iterations): + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compilation_kwargs = { + "use_python_runtime": False, + "enabled_precisions": {torch.float}, + "truncate_double": True, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache", + "engine_cache_size": 1 << 30, # 1GB + } + optimized_model = torch.compile( + model, + backend="torch_tensorrt", + options=compilation_kwargs, + ) + optimized_model(*inputs) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile bert-----> compilation time:\n", times, "milliseconds") + + +if __name__ == "__main__": + compile_bert() diff --git a/docs/_images/sphx_glr_engine_caching_bert_example_thumb.png b/docs/_images/sphx_glr_engine_caching_bert_example_thumb.png new file mode 100644 index 0000000000..8a5fed589d Binary files /dev/null and b/docs/_images/sphx_glr_engine_caching_bert_example_thumb.png differ diff --git a/docs/_images/sphx_glr_engine_caching_example_thumb.png b/docs/_images/sphx_glr_engine_caching_example_thumb.png new file mode 100644 index 0000000000..8a5fed589d Binary files /dev/null and b/docs/_images/sphx_glr_engine_caching_example_thumb.png differ diff --git a/docs/_images/sphx_glr_torch_export_gpt2_thumb.png b/docs/_images/sphx_glr_torch_export_gpt2_thumb.png new file mode 100644 index 0000000000..8a5fed589d Binary files /dev/null and b/docs/_images/sphx_glr_torch_export_gpt2_thumb.png differ diff --git a/docs/_images/sphx_glr_torch_export_llama2_thumb.png b/docs/_images/sphx_glr_torch_export_llama2_thumb.png new file mode 100644 index 0000000000..8a5fed589d Binary files /dev/null and b/docs/_images/sphx_glr_torch_export_llama2_thumb.png differ diff --git a/docs/_modules/index.html b/docs/_modules/index.html index 2cf3f16ce4..4ad1d8c40e 100644 --- a/docs/_modules/index.html +++ b/docs/_modules/index.html @@ -9,7 +9,7 @@ - Overview: module code — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Overview: module code — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -272,7 +272,7 @@
      - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
      @@ -313,6 +313,9 @@
    • Deploying Torch-TensorRT Programs
    • DLA
    • Torch Compile Advanced Usage
    • +
    • Deploy Quantized Models using Torch-TensorRT
    • +
    • Engine Caching
    • +
    • Refitting Torch-TensorRT Programs with New Weights

    Dynamo Frontend
      @@ -338,7 +341,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • @@ -471,6 +473,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < dryrun_stats_display, parse_non_trt_nodes, ) +from torch_tensorrt.dynamo._engine_cache import BaseEngineCache, DiskEngineCache from torch_tensorrt.dynamo.conversion import ( CompilationSettings, UnsupportedOperatorException, @@ -488,8 +491,7 @@

      < ) from torch_tensorrt.dynamo.utils import ( get_flat_args_with_check, - get_torch_inputs, - parse_complex_tensor_structs, + parse_graph_io, prepare_inputs, set_log_level, to_torch_device, @@ -536,6 +538,11 @@

      < hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE, timing_cache_path: str = _defaults.TIMING_CACHE_PATH, lazy_engine_init: bool = _defaults.LAZY_ENGINE_INIT, + cache_built_engines: bool = _defaults.CACHE_BUILT_ENGINES, + reuse_cached_engines: bool = _defaults.REUSE_CACHED_ENGINES, + engine_cache_dir: str = _defaults.ENGINE_CACHE_DIR, + engine_cache_size: int = _defaults.ENGINE_CACHE_SIZE, + custom_engine_cache: Optional[BaseEngineCache] = _defaults.CUSTOM_ENGINE_CACHE, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -601,6 +608,11 @@

      < hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation lazy_engine_init (bool): Defer setting up engines until the compilation of all engines is complete. Can allow larger models with multiple graph breaks to compile but can lead to oversubscription of GPU memory at runtime. + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage + engine_cache_dir (Optional[str]): Directory to store the cached TRT engines + engine_cache_size (Optional[int]): Maximum hard-disk space (bytes) to use for the engine cache, default is 1GB. If the cache exceeds this size, the oldest engines will be removed by default + custom_engine_cache (Optional[BaseEngineCache]): Engine cache instance to use for saving and loading engines. Users can provide their own engine cache by inheriting from BaseEngineCache. If used, engine_cache_dir and engine_cache_size will be ignored. **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -673,10 +685,22 @@

      < ) gm = exported_program.module() logger.debug("Input graph: " + str(gm.graph)) + # Apply lowering on the graph module gm = post_lowering(gm) logger.debug("Lowered Input graph: " + str(gm.graph)) + engine_cache = None + if cache_built_engines or reuse_cached_engines: + assert ( + make_refitable + ), "Engine caching requires make_refitable to be set to True" + engine_cache = ( + custom_engine_cache + if custom_engine_cache is not None + else DiskEngineCache(engine_cache_dir, engine_cache_size) + ) + compilation_options = { "enabled_precisions": ( enabled_precisions if enabled_precisions else _defaults.ENABLED_PRECISIONS @@ -710,11 +734,15 @@

      < "hardware_compatible": hardware_compatible, "timing_cache_path": timing_cache_path, "lazy_engine_init": lazy_engine_init, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, } settings = CompilationSettings(**compilation_options) logger.info("Compilation Settings: %s\n", settings) - trt_gm = compile_module(gm, trt_arg_inputs, trt_kwarg_inputs, settings) + trt_gm = compile_module( + gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache + ) return trt_gm
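The ``engine_cache`` object threaded through ``compile_module`` above is any ``BaseEngineCache`` implementation: the default ``DiskEngineCache(engine_cache_dir, engine_cache_size)`` or a user-provided ``custom_engine_cache``. A minimal sketch of such a custom cache, assuming only the ``save(hash, blob)`` / ``load(hash)`` interface that the ``RAMEngineCache`` example later in this changeset uses (the class name and file layout below are illustrative, not part of the library):

.. code-block:: python

    import os
    from typing import Optional

    from torch_tensorrt.dynamo._engine_cache import BaseEngineCache


    class SharedDiskEngineCache(BaseEngineCache):  # illustrative name only
        def __init__(self, cache_dir: str) -> None:
            self.cache_dir = cache_dir
            os.makedirs(cache_dir, exist_ok=True)

        def save(self, hash: str, blob: bytes) -> None:
            # Persist the serialized engine blob under its subgraph hash.
            with open(os.path.join(self.cache_dir, f"{hash}.bin"), "wb") as f:
                f.write(blob)

        def load(self, hash: str) -> Optional[bytes]:
            # Return the cached blob on a hit, or None to signal a miss.
            path = os.path.join(self.cache_dir, f"{hash}.bin")
            if not os.path.exists(path):
                return None
            with open(path, "rb") as f:
                return f.read()

An instance would be passed to ``torch_tensorrt.dynamo.compile`` via ``custom_engine_cache``; when that argument is omitted, the logic above falls back to ``DiskEngineCache``.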
      @@ -723,6 +751,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < sample_arg_inputs: Sequence[Input], sample_kwarg_inputs: Optional[dict[Any, Any]] = None, settings: CompilationSettings = CompilationSettings(), + engine_cache: Optional[BaseEngineCache] = None, ) -> torch.fx.GraphModule: """Compile a traced FX module @@ -733,6 +762,7 @@

      Source code for torch_tensorrt.dynamo._compiler

      < arg_inputs: Inputs to the module kwarg_inputs: kwargs to the module settings: Compilation settings + engine_cache: Engine cache instance to store/load compiled engines Returns: Compiled FX GraphModule """ @@ -752,14 +782,6 @@

      Source code for torch_tensorrt.dynamo._compiler

      < dryrun_tracker.total_ops_in_graph = total_ops dryrun_tracker.supported_ops_in_graph = num_supported_ops - dryrun_tracker.graph_input_shapes = parse_complex_tensor_structs( - sample_arg_inputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - dryrun_tracker.graph_input_dtypes = parse_complex_tensor_structs( - sample_arg_inputs, "dtype", lambda t: t.to(torch.dtype, use_default=True) - ) dryrun_tracker.compilation_settings = settings if settings.dryrun and settings.min_block_size > 1: @@ -846,6 +868,11 @@

      Source code for torch_tensorrt.dynamo._compiler

      < # Criteria for a module to be convertible to TRT if settings.use_fast_partitioner and "_run_on_acc" not in name: dryrun_tracker.to_run_in_torch.extend(parse_non_trt_nodes(submodule)) + logger.debug( + "Submodule in PyTorch: %s\n %s", + str(name), + str(submodule.graph), + ) continue subgraph_data = PerSubgraphData() @@ -880,28 +907,8 @@

      Source code for torch_tensorrt.dynamo._compiler

      < name, ) - subgraph_data.subgraph_input_shapes = parse_complex_tensor_structs( - submodule_inputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - subgraph_data.subgraph_input_dtypes = parse_complex_tensor_structs( - submodule_inputs, "dtype", lambda t: t.to(torch.dtype) - ) - - submodule_outputs = submodule( - *get_torch_inputs(submodule_inputs, to_torch_device(settings.device)) - ) - - subgraph_data.subgraph_output_shapes = parse_complex_tensor_structs( - submodule_outputs, - "shape", - lambda x: dict(x) if isinstance(x, dict) else tuple(x), - ) - subgraph_data.subgraph_output_dtypes = parse_complex_tensor_structs( - submodule_outputs, "dtype" - ) - + # Parse the subgraph I/O and store it + parse_graph_io(submodule, subgraph_data) dryrun_tracker.tensorrt_graph_count += 1 dryrun_tracker.per_subgraph_data.append(subgraph_data) @@ -912,27 +919,13 @@

      Source code for torch_tensorrt.dynamo._compiler

      < submodule_inputs, settings=settings, name=name, + engine_cache=engine_cache, ) trt_modules[name] = trt_module - torch_sample_arg_inputs = get_torch_inputs( - sample_arg_inputs, to_torch_device(settings.device) - ) - torch_sample_kwarg_inputs = get_torch_inputs( - sample_kwarg_inputs, to_torch_device(settings.device) - ) - sample_outputs = gm(*torch_sample_arg_inputs, **torch_sample_kwarg_inputs) - - if not isinstance(sample_outputs, (list, tuple)): - sample_outputs = [sample_outputs] - - dryrun_tracker.graph_output_shapes = parse_complex_tensor_structs( - sample_outputs, "shape", lambda x: dict(x) if isinstance(x, dict) else tuple(x) - ) - dryrun_tracker.graph_output_dtypes = parse_complex_tensor_structs( - sample_outputs, "dtype" - ) + # Parse the graph I/O and store it in dryrun tracker + parse_graph_io(gm, dryrun_tracker) # Replace all FX Modules with TRT Modules for name, trt_module in trt_modules.items(): @@ -1066,10 +1059,10 @@

      Source code for torch_tensorrt.dynamo._compiler

      < DeprecationWarning, stacklevel=2, ) - if not arg_inputs and not inputs: + if arg_inputs is None and inputs is None: raise AssertionError("'arg_inputs' and 'inputs' should not both be None.") - elif arg_inputs and inputs: + elif arg_inputs is not None and inputs is not None: raise AssertionError( "'arg_inputs' and 'inputs' should not be used at the same time." ) diff --git a/docs/_modules/torch_tensorrt/dynamo/_exporter.html b/docs/_modules/torch_tensorrt/dynamo/_exporter.html index 7beef6382d..8e3164aa31 100644 --- a/docs/_modules/torch_tensorrt/dynamo/_exporter.html +++ b/docs/_modules/torch_tensorrt/dynamo/_exporter.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + torch_tensorrt.dynamo._exporter — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -272,7 +272,7 @@
      - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
      @@ -313,6 +313,9 @@
    • Deploying Torch-TensorRT Programs
    • DLA
    • Torch Compile Advanced Usage
    • +
    • Deploy Quantized Models using Torch-TensorRT
    • +
    • Engine Caching
    • +
    • Refitting Torch-TensorRT Programs with New Weights

    Dynamo Frontend
      @@ -338,7 +341,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • @@ -459,6 +461,7 @@

      Source code for torch_tensorrt.dynamo._settings

      < from torch_tensorrt._enums import EngineCapability, dtype from torch_tensorrt.dynamo._defaults import ( ASSUME_DYNAMIC_SHAPE_SUPPORT, + CACHE_BUILT_ENGINES, DEBUG, DISABLE_TF32, DLA_GLOBAL_DRAM_SIZE, @@ -477,6 +480,7 @@

      < OPTIMIZATION_LEVEL, PASS_THROUGH_BUILD_FAILURES, REQUIRE_FULL_COMPILATION, + REUSE_CACHED_ENGINES, SPARSE_WEIGHTS, TIMING_CACHE_PATH, TRUNCATE_DOUBLE, @@ -527,6 +531,8 @@

      < output to a file if a string path is specified hardware_compatible (bool): Build the TensorRT engines compatible with GPU architectures other than that of the GPU on which the engine was built (currently works for NVIDIA Ampere and newer) timing_cache_path (str): Path to the timing cache if it exists (or) where it will be saved after compilation + cache_built_engines (bool): Whether to save the compiled TRT engines to storage + reuse_cached_engines (bool): Whether to load the compiled TRT engines from storage """ enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS) @@ -558,7 +564,9 @@

      < dryrun: Union[bool, str] = DRYRUN hardware_compatible: bool = HARDWARE_COMPATIBLE timing_cache_path: str = TIMING_CACHE_PATH - lazy_engine_init: bool = LAZY_ENGINE_INIT
      + lazy_engine_init: bool = LAZY_ENGINE_INIT + cache_built_engines: bool = CACHE_BUILT_ENGINES + reuse_cached_engines: bool = REUSE_CACHED_ENGINES
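The two fields added above surface the engine-cache toggles on ``CompilationSettings``; end users normally set them through ``torch_tensorrt.dynamo.compile`` or the ``torch.compile`` backend options shown in the engine caching tutorial elsewhere in this changeset. A minimal sketch constructing the settings object directly (values here are illustrative):

.. code-block:: python

    from torch_tensorrt.dynamo._settings import CompilationSettings

    # Opt a compilation into the engine cache. Note that the assert added in
    # _compiler.py earlier in this changeset additionally requires
    # make_refitable=True whenever caching is enabled.
    settings = CompilationSettings(
        cache_built_engines=True,   # save newly built TRT engines to storage
        reuse_cached_engines=True,  # load a stored engine when the subgraph hash matches
    )
    print(settings.cache_built_engines, settings.reuse_cached_engines)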
      diff --git a/docs/_modules/torch_tensorrt/dynamo/_tracer.html b/docs/_modules/torch_tensorrt/dynamo/_tracer.html index aeda025161..605fa34caf 100644 --- a/docs/_modules/torch_tensorrt/dynamo/_tracer.html +++ b/docs/_modules/torch_tensorrt/dynamo/_tracer.html @@ -9,7 +9,7 @@ - torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + torch_tensorrt.dynamo._tracer — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -272,7 +272,7 @@
      - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
      @@ -313,6 +313,9 @@
    • Deploying Torch-TensorRT Programs
    • DLA
    • Torch Compile Advanced Usage
    • +
    • Deploy Quantized Models using Torch-TensorRT
    • +
    • Engine Caching
    • +
    • Refitting Torch-TensorRT Programs with New Weights

    Dynamo Frontend
      @@ -338,7 +341,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • diff --git a/docs/_sources/index.rst.txt b/docs/_sources/index.rst.txt index da5ee3d690..757acc2011 100644 --- a/docs/_sources/index.rst.txt +++ b/docs/_sources/index.rst.txt @@ -44,13 +44,14 @@ User Guide :hidden: user_guide/torch_tensorrt_explained - user_guide/getting_started user_guide/dynamic_shapes user_guide/saving_models user_guide/runtime user_guide/using_dla tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage - tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq + tutorials/_rendered_examples/dynamo/vgg16_ptq + tutorials/_rendered_examples/dynamo/engine_caching_example + tutorials/_rendered_examples/dynamo/refit_engine_example Dynamo Frontend ---------------- @@ -111,13 +112,11 @@ Tutorials tutorials/notebooks tutorials/_rendered_examples/dynamo/torch_compile_resnet_example tutorials/_rendered_examples/dynamo/torch_compile_transformers_example - tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion tutorials/_rendered_examples/dynamo/torch_export_cudagraphs tutorials/_rendered_examples/dynamo/custom_kernel_plugins tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion - tutorials/_rendered_examples/dynamo/vgg16_fp8_ptq tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example Python API Documentation diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt new file mode 100644 index 0000000000..e72f42cfb2 --- /dev/null +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_bert_example.rst.txt @@ -0,0 +1,127 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "tutorials/_rendered_examples/dynamo/engine_caching_bert_example.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + :ref:`Go to the end ` + to download the full example code + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py: + + +.. _engine_caching_bert_example: + +Engine Caching (BERT) +======================= + +Small caching example on BERT. + +.. GENERATED FROM PYTHON SOURCE LINES 10-76 + +.. code-block:: python + + + import numpy as np + import torch + import torch_tensorrt + from engine_caching_example import remove_timing_cache + from transformers import BertModel + + np.random.seed(0) + torch.manual_seed(0) + + model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval() + inputs = [ + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), + torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"), + ] + + + def compile_bert(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
+ for i in range(iterations): + # remove timing cache and reset dynamo for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compilation_kwargs = { + "use_python_runtime": False, + "enabled_precisions": {torch.float}, + "truncate_double": True, + "debug": False, + "min_block_size": 1, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache", + "engine_cache_size": 1 << 30, # 1GB + } + optimized_model = torch.compile( + model, + backend="torch_tensorrt", + options=compilation_kwargs, + ) + optimized_model(*inputs) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("-----compile bert-----> compilation time:\n", times, "milliseconds") + + + if __name__ == "__main__": + compile_bert() + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 0.000 seconds) + + +.. _sphx_glr_download_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py: + +.. only:: html + + .. container:: sphx-glr-footer sphx-glr-footer-example + + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: engine_caching_bert_example.py ` + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: engine_caching_bert_example.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt new file mode 100644 index 0000000000..df61bec65e --- /dev/null +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/engine_caching_example.rst.txt @@ -0,0 +1,361 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "tutorials/_rendered_examples/dynamo/engine_caching_example.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + :ref:`Go to the end ` + to download the full example code + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py: + + +.. _engine_caching_example: + +Engine Caching +======================= + +As model sizes increase, the cost of compilation will as well. With AOT methods +like ``torch.dynamo.compile``, this cost is paid upfront. However if the weights +change, the session ends or you are using JIT methods like ``torch.compile``, as +graphs get invalidated they get re-compiled, this cost will get paid repeatedly. +Engine caching is a way to mitigate this cost by saving constructed engines to disk +and re-using them when possible. This tutorial demonstrates how to use engine caching +with TensorRT in PyTorch. Engine caching can significantly speed up subsequent model +compilations reusing previously built TensorRT engines. + +We'll explore two approaches: + + 1. Using torch_tensorrt.dynamo.compile + 2. Using torch.compile with the TensorRT backend + +The example uses a pre-trained ResNet18 model and shows the +differences between compilation without caching, with caching enabled, +and when reusing cached engines. 
+
+.. GENERATED FROM PYTHON SOURCE LINES 26-52
+
+.. code-block:: python
+
+
+    import os
+    from typing import Dict, Optional
+
+    import numpy as np
+    import torch
+    import torch_tensorrt as torch_trt
+    import torchvision.models as models
+    from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
+    from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
+
+    np.random.seed(0)
+    torch.manual_seed(0)
+
+    model = models.resnet18(pretrained=True).eval().to("cuda")
+    enabled_precisions = {torch.float}
+    debug = False
+    min_block_size = 1
+    use_python_runtime = False
+
+
+    def remove_timing_cache(path=TIMING_CACHE_PATH):
+        if os.path.exists(path):
+            os.remove(path)
+
+
+
+.. GENERATED FROM PYTHON SOURCE LINES 53-67
+
+Engine Caching for JIT Compilation
+----------------------------------
+
+The primary goal of engine caching is to help speed up JIT workflows. ``torch.compile``
+provides a great deal of flexibility in model construction, which makes it a good
+first tool to try when looking to speed up your workflow. However, historically
+the cost of compilation, and in particular recompilation, has been a barrier to entry
+for many users. Prior to the addition of engine caching, if for some reason a subgraph got
+invalidated, that graph was reconstructed from scratch. Now, as engines are constructed with ``cache_built_engines=True``,
+engines are saved to disk, tied to a hash of their corresponding PyTorch subgraph. In
+a subsequent compilation, either as part of this session or a new session, the cache will
+pull the built engine and **refit** the weights, which can reduce compilation times by orders of magnitude.
+As such, in order to insert a new engine into the cache (i.e. ``cache_built_engines=True``),
+the engine must be refittable (``make_refitable=True``). See :ref:`refit_engine_example` for more details.
+
+.. GENERATED FROM PYTHON SOURCE LINES 67-118
+
+.. code-block:: python
+
+
+
+    def torch_compile(iterations=3):
+        times = []
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+
+        # The 1st iteration is to measure the compilation time without engine caching
+        # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
+        # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
+        # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. 
+ for i in range(iterations): + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + # remove timing cache and reset dynamo just for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": enabled_precisions, + "debug": debug, + "min_block_size": min_block_size, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + }, + ) + compiled_model(*inputs) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("----------------torch_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") + + + torch_compile() + + +.. GENERATED FROM PYTHON SOURCE LINES 119-124 + +Engine Caching for AOT Compilation +---------------------------------- +Similarly to the JIT workflow, AOT workflows can benefit from engine caching. +As the same architecture or common subgraphs get recompiled, the cache will pull +previously built engines and refit the weights. + +.. GENERATED FROM PYTHON SOURCE LINES 124-178 + +.. code-block:: python + + + + def dynamo_compile(iterations=3): + times = [] + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),) + # Mark the dim0 of inputs as dynamic + batch = torch.export.Dim("batch", min=1, max=200) + exp_program = torch.export.export( + model, args=example_inputs, dynamic_shapes={"x": {0: batch}} + ) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + for i in range(iterations): + inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")] + remove_timing_cache() # remove timing cache just for engine caching messurement + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + trt_gm = torch_trt.dynamo.compile( + exp_program, + tuple(inputs), + use_python_runtime=use_python_runtime, + enabled_precisions=enabled_precisions, + debug=debug, + min_block_size=min_block_size, + make_refitable=True, + cache_built_engines=cache_built_engines, + reuse_cached_engines=reuse_cached_engines, + engine_cache_size=1 << 30, # 1GB + ) + # output = trt_gm(*inputs) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("----------------dynamo_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") + + + dynamo_compile() + + +.. 
GENERATED FROM PYTHON SOURCE LINES 179-195 + +Custom Engine Cache +---------------------- + +By default, the engine cache is stored in the system's temporary directory. Both the cache directory and +size limit can be customized by passing ``engine_cache_dir`` and ``engine_cache_size``. +Users can also define their own engine cache implementation by extending the ``BaseEngineCache`` class. +This allows for remote or shared caching if so desired. + +The custom engine cache should implement the following methods: + - ``save``: Save the engine blob to the cache. + - ``load``: Load the engine blob from the cache. + +The hash provided by the cache systen is a weight agnostic hash of the originating PyTorch subgraph (post lowering). +The blob contains a serialized engine, calling spec data, and weight map information in the pickle format + +Below is an example of a custom engine cache implementation that implents a ``RAMEngineCache``. + +.. GENERATED FROM PYTHON SOURCE LINES 195-289 + +.. code-block:: python + + + + class RAMEngineCache(BaseEngineCache): + def __init__( + self, + ) -> None: + """ + Constructs a user held engine cache in memory. + """ + self.engine_cache: Dict[str, bytes] = {} + + def save( + self, + hash: str, + blob: bytes, + ): + """ + Insert the engine blob to the cache. + + Args: + hash (str): The hash key to associate with the engine blob. + blob (bytes): The engine blob to be saved. + + Returns: + None + """ + self.engine_cache[hash] = blob + + def load(self, hash: str) -> Optional[bytes]: + """ + Load the engine blob from the cache. + + Args: + hash (str): The hash key of the engine to load. + + Returns: + Optional[bytes]: The engine blob if found, None otherwise. + """ + if hash in self.engine_cache: + return self.engine_cache[hash] + else: + return None + + + def torch_compile_my_cache(iterations=3): + times = [] + engine_cache = RAMEngineCache() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + # The 1st iteration is to measure the compilation time without engine caching + # The 2nd and 3rd iterations are to measure the compilation time with engine caching. + # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration. + # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine. + for i in range(iterations): + inputs = [torch.rand((100, 3, 224, 224)).to("cuda")] + # remove timing cache and reset dynamo just for engine caching messurement + remove_timing_cache() + torch._dynamo.reset() + + if i == 0: + cache_built_engines = False + reuse_cached_engines = False + else: + cache_built_engines = True + reuse_cached_engines = True + + start.record() + compiled_model = torch.compile( + model, + backend="tensorrt", + options={ + "use_python_runtime": True, + "enabled_precisions": enabled_precisions, + "debug": debug, + "min_block_size": min_block_size, + "make_refitable": True, + "cache_built_engines": cache_built_engines, + "reuse_cached_engines": reuse_cached_engines, + "custom_engine_cache": engine_cache, + }, + ) + compiled_model(*inputs) # trigger the compilation + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) + + print("----------------torch_compile----------------") + print("disable engine caching, used:", times[0], "ms") + print("enable engine caching to cache engines, used:", times[1], "ms") + print("enable engine caching to reuse engines, used:", times[2], "ms") + + + torch_compile_my_cache() + + +.. 
rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 0.000 seconds) + + +.. _sphx_glr_download_tutorials__rendered_examples_dynamo_engine_caching_example.py: + +.. only:: html + + .. container:: sphx-glr-footer sphx-glr-footer-example + + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: engine_caching_example.py ` + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: engine_caching_example.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt index 6e5917ae7b..64ecdc59aa 100644 --- a/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/index.rst.txt @@ -19,6 +19,8 @@ a number of ways you can leverage this backend to accelerate inference. * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights * :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule * :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile`` +* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times +* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT @@ -61,6 +63,23 @@ a number of ways you can leverage this backend to accelerate inference. +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py` + +.. raw:: html + +
      Refitting Torch-TensorRT Programs with New Weights
      +
      + + .. raw:: html
      @@ -80,18 +99,18 @@ a number of ways you can leverage this backend to accelerate inference. .. raw:: html -
      +
      .. only:: html - .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png :alt: - :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py` + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py` .. raw:: html -
      Refit TenorRT Graph Module with Torch-TensorRT
      +
      Compiling GPT2 using the Torch-TensorRT with dynamo backend
      @@ -112,6 +131,40 @@ a number of ways you can leverage this backend to accelerate inference.
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py` + +.. raw:: html + +
      Compiling Llama2 using the Torch-TensorRT with dynamo backend
      +
      + + +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py` + +.. raw:: html + +
      Engine Caching (BERT)
      +
      + + .. raw:: html
      @@ -163,6 +216,23 @@ a number of ways you can leverage this backend to accelerate inference.
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py` + +.. raw:: html + +
      Engine Caching
      +
      + + .. raw:: html
      @@ -190,11 +260,15 @@ a number of ways you can leverage this backend to accelerate inference. /tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion /tutorials/_rendered_examples/dynamo/torch_export_cudagraphs - /tutorials/_rendered_examples/dynamo/torch_compile_transformers_example /tutorials/_rendered_examples/dynamo/refit_engine_example + /tutorials/_rendered_examples/dynamo/torch_compile_transformers_example + /tutorials/_rendered_examples/dynamo/torch_export_gpt2 /tutorials/_rendered_examples/dynamo/torch_compile_advanced_usage + /tutorials/_rendered_examples/dynamo/torch_export_llama2 + /tutorials/_rendered_examples/dynamo/engine_caching_bert_example /tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example /tutorials/_rendered_examples/dynamo/torch_compile_resnet_example /tutorials/_rendered_examples/dynamo/vgg16_ptq + /tutorials/_rendered_examples/dynamo/engine_caching_example /tutorials/_rendered_examples/dynamo/custom_kernel_plugins diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt index cc0b9fd21e..fb48bc8536 100644 --- a/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/refit_engine_example.rst.txt @@ -20,31 +20,38 @@ .. _refit_engine_example: -Refit TenorRT Graph Module with Torch-TensorRT +Refitting Torch-TensorRT Programs with New Weights =================================================================== -We are going to demonstrate how a compiled TensorRT Graph Module can be refitted with updated weights. - -In many cases, we frequently update the weights of models, such as applying various LoRA to Stable Diffusion or constant A/B testing of AI products. -That poses challenges for TensorRT inference optimizations, as compiling the TensorRT engines takes significant time, making repetitive compilation highly inefficient. -Torch-TensorRT supports refitting TensorRT graph modules without re-compiling the engine, considerably accelerating the workflow. +Compilation is an expensive operation as it involves many graph transformations, translations +and optimizations applied on the model. In cases were the weights of a model might be updated +occasionally (e.g. inserting LoRA adapters), the large cost of recompilation can make it infeasible +to use TensorRT if the compiled program needed to be built from scratch each time. Torch-TensorRT +provides a PyTorch native mechanism to update the weights of a compiled TensorRT program without +recompiling from scratch through weight refitting. In this tutorial, we are going to walk through -1. Compiling a PyTorch model to a TensorRT Graph Module -2. Save and load a graph module -3. Refit the graph module -.. GENERATED FROM PYTHON SOURCE LINES 20-22 + 1. Compiling a PyTorch model to a TensorRT Graph Module + 2. Save and load a graph module + 3. Refit the graph module + +This tutorial focuses mostly on the AOT workflow where it is most likely that a user might need to +manually refit a module. In the JIT workflow, weight changes trigger recompilation. As the engine +has previously been built, with an engine cache enabled, Torch-TensorRT can automatically recognize +a previously built engine, trigger refit and short cut recompilation on behalf of the user (see: :ref:`engine_caching_example`). + +.. GENERATED FROM PYTHON SOURCE LINES 27-29 Standard Workflow ----------------------------- -.. 
GENERATED FROM PYTHON SOURCE LINES 24-26
+.. GENERATED FROM PYTHON SOURCE LINES 31-33
 
 Imports and model definition
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. GENERATED FROM PYTHON SOURCE LINES 26-38
+.. GENERATED FROM PYTHON SOURCE LINES 33-45
 
 .. code-block:: python
 
@@ -61,17 +68,24 @@ Imports and model definition
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 39-41
+.. GENERATED FROM PYTHON SOURCE LINES 46-55
 
-Compile the module for the first time and save it.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Make a Refitable Compilation Program
+---------------------------------------
 
-.. GENERATED FROM PYTHON SOURCE LINES 41-66
+The initial step is to compile a module and save it as with a normal model. Note that there is an
+additional parameter `make_refitable` that is set to `True`. This parameter is used to
+indicate that the engine being built should support weight refitting later. Engines built without
+this setting will not be able to be refit.
+
+In this case we are going to compile a ResNet18 model with randomly initialized weights and save it.
+
+.. GENERATED FROM PYTHON SOURCE LINES 55-79
 
 .. code-block:: python
 
 
-    model = models.resnet18(pretrained=True).eval().to("cuda")
+    model = models.resnet18(pretrained=False).eval().to("cuda")
     exp_program = torch.export.export(model, tuple(inputs))
     enabled_precisions = {torch.float}
     debug = False
@@ -91,23 +105,27 @@ Compile the module for the first time and save it.
     ) # Output is a torch.fx.GraphModule
 
     # Save the graph module as an exported program
-    # This is only supported when use_python_runtime = False
     torch_trt.save(trt_gm, "./compiled.ep", inputs=inputs)
 
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 67-69
+.. GENERATED FROM PYTHON SOURCE LINES 80-87
 
-Refit the module with update model weights
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Refit the Program with Pretrained Weights
+------------------------------------------
 
-.. GENERATED FROM PYTHON SOURCE LINES 69-93
+Random weights are not useful for inference. But now, instead of recompiling the model, we can
+refit the model with the pretrained weights. This is done by setting up another PyTorch module
+with the target weights and exporting it as an ExportedProgram. Then the ``refit_module_weights``
+function is used to update the weights of the compiled module with the new weights.
+
+.. GENERATED FROM PYTHON SOURCE LINES 87-111
 
 .. code-block:: python
 
 
     # Create and compile the updated model
-    model2 = models.resnet18(pretrained=False).eval().to("cuda")
+    model2 = models.resnet18(pretrained=True).eval().to("cuda")
     exp_program2 = torch.export.export(model2, tuple(inputs))
 
 
@@ -130,18 +148,36 @@ Refit the module with update model weights
 
     print("Refit successfully!")
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 94-96
+.. GENERATED FROM PYTHON SOURCE LINES 112-140
 
-Alternative Workflow using Python Runtime
+Advanced Usage
 -----------------------------
 
-.. GENERATED FROM PYTHON SOURCE LINES 96-99
+There are a number of settings you can use to control the refit process.
 
-.. code-block:: python
+Weight Map Cache
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Weight refitting works by matching the weights of the compiled module with the new weights from
+the user-supplied ExportedProgram. Since 1:1 name matching from PyTorch to TensorRT is hard to accomplish,
+the only guaranteed way to match weights at *refit-time* is to pass the new ExportedProgram through the
+early phases of the compilation process to generate near-identical weight names. This can be expensive
+and is not always necessary. 
+To avoid this, **At initial compile**, Torch-TensorRt will attempt to cache a direct mapping from PyTorch +weights to TensorRT weights. This cache is stored in the compiled module as metadata and can be used +to speed up refit. If the cache is not present, the refit system will fallback to rebuilding the mapping at +refit-time. Use of this cache is controlled by the ``use_weight_map_cache`` parameter. + +Since the cache uses a heuristic based system for matching PyTorch and TensorRT weights, you may want to verify the refitting. This can be done by setting +``verify_output`` to True and providing sample ``arg_inputs`` and ``kwarg_inputs``. When this is done, the refit +system will run the refitted module and the user supplied module on the same inputs and compare the outputs. + +In-Place Refit +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - # Currently python runtime does not support engine serialization. So the refitting will be done in the same runtime. - # This usecase is more useful when you need to switch different weights in the same runtime, such as using Stable Diffusion. +``in_place`` allows the user to refit the module in place. This is useful when the user wants to update the weights +of the compiled module without creating a new module. .. rst-class:: sphx-glr-timing diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt new file mode 100644 index 0000000000..be90efc337 --- /dev/null +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_gpt2.rst.txt @@ -0,0 +1,168 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "tutorials/_rendered_examples/dynamo/torch_export_gpt2.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. note:: + :class: sphx-glr-download-link-note + + :ref:`Go to the end ` + to download the full example code + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py: + + +.. _torch_export_gpt2: + +Compiling GPT2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model. + +.. GENERATED FROM PYTHON SOURCE LINES 10-12 + +Imports and Model Definition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 12-17 + +.. code-block:: python + + import torch + import torch_tensorrt + from transformers import AutoModelForCausalLM, AutoTokenizer + from utils import export_llm, generate + + +.. GENERATED FROM PYTHON SOURCE LINES 18-35 + +.. code-block:: python + + + # Define the parameters and initialize the model + MAX_TOKENS = 32 + DEVICE = torch.device("cuda:0") + + # Define the GPT2 model from hugging face + # kv_cache is not supported in Torch-TRT currently. + # CPU is used here so that GPU memory is reserved for TRT compilation. + with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ).eval() + + +.. GENERATED FROM PYTHON SOURCE LINES 36-37 + +Tokenize a sample input prompt and get pytorch model outputs + +.. GENERATED FROM PYTHON SOURCE LINES 37-46 + +.. 
code-block:: python + + prompt = "I enjoy walking with my cute dog" + model_inputs = tokenizer(prompt, return_tensors="pt") + input_ids = model_inputs["input_ids"] + + # Auto-regressive generation loop for greedy decoding using PyTorch model + # We use a custom generate function which is very similar to the huggingface one. + pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + + + +.. GENERATED FROM PYTHON SOURCE LINES 47-49 + +Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 49-67 + +.. code-block:: python + + + # Export the GPT2 model into an ExportedProgram which is input of TRT compilation + gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) + trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + ) + + # Auto-regressive generation loop for greedy decoding using TensorRT model + # We use a custom generate function which is very similar to the huggingface one. + # Move inputs to GPU + input_ids = input_ids.to(DEVICE) + trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + + +.. GENERATED FROM PYTHON SOURCE LINES 68-70 + +Decode the output sentences of PyTorch and TensorRT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 70-81 + +.. code-block:: python + + print("=============================") + print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), + ) + print("=============================") + print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), + ) + + +.. GENERATED FROM PYTHON SOURCE LINES 82-87 + +The output sentences should look like +============================= +Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +============================= +TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 0.000 seconds) + + +.. _sphx_glr_download_tutorials__rendered_examples_dynamo_torch_export_gpt2.py: + +.. only:: html + + .. container:: sphx-glr-footer sphx-glr-footer-example + + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: torch_export_gpt2.py ` + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: torch_export_gpt2.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt new file mode 100644 index 0000000000..5e66a72aab --- /dev/null +++ b/docs/_sources/tutorials/_rendered_examples/dynamo/torch_export_llama2.rst.txt @@ -0,0 +1,175 @@ + +.. DO NOT EDIT. +.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "tutorials/_rendered_examples/dynamo/torch_export_llama2.py" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. only:: html + + .. 
note:: + :class: sphx-glr-download-link-note + + :ref:`Go to the end ` + to download the full example code + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py: + + +.. _torch_export_llama2: + +Compiling Llama2 using the Torch-TensorRT with dynamo backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model. + +.. GENERATED FROM PYTHON SOURCE LINES 10-12 + +Imports and Model Definition +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 12-17 + +.. code-block:: python + + import torch + import torch_tensorrt + from transformers import AutoModelForCausalLM, AutoTokenizer + from utils import export_llm, generate + + +.. GENERATED FROM PYTHON SOURCE LINES 18-19 + +Define the parameters and initialize the model + +.. GENERATED FROM PYTHON SOURCE LINES 19-33 + +.. code-block:: python + + MAX_TOKENS = 32 + DEVICE = torch.device("cuda:0") + + # Define the Llama2 model from hugging face + # kv_cache is not supported in Torch-TRT currently. + # CPU is used here so that GPU memory is reserved for TRT compilation. + llama_path = "meta-llama/Llama-2-7b-chat-hf" + with torch.no_grad(): + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() + + tokenizer = AutoTokenizer.from_pretrained(llama_path) + + +.. GENERATED FROM PYTHON SOURCE LINES 34-35 + +Tokenize a sample input prompt and get pytorch model outputs + +.. GENERATED FROM PYTHON SOURCE LINES 35-43 + +.. code-block:: python + + prompt = "What is dynamic programming?" + model_inputs = tokenizer(prompt, return_tensors="pt") + input_ids = model_inputs.input_ids + + # Auto-regressive generation loop for greedy decoding using PyTorch model + # We use a custom generate function which is very similar to the huggingface one. + pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + + +.. GENERATED FROM PYTHON SOURCE LINES 44-46 + +Compilation with `Torch-TensorRT` using dynamo backend and generate TensorRT outputs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 46-65 + +.. code-block:: python + + + # Export the llama2 model into an ExportedProgram which is input of TRT compilation + llama2_ep = export_llm(model, input_ids, max_seq_len=64) + trt_model = torch_tensorrt.dynamo.compile( + llama2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + min_block_size=1, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + ) + + # Auto-regressive generation loop for greedy decoding using TensorRT model + # We use a custom generate function which is very similar to the huggingface one. + # Move inputs to GPU + input_ids = input_ids.to(DEVICE) + trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + + +.. GENERATED FROM PYTHON SOURCE LINES 66-68 + +Decode the output sentences of PyTorch and TensorRT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. GENERATED FROM PYTHON SOURCE LINES 68-85 + +.. 
code-block:: python + + print("=============================") + print( + "Pytorch model generated text: ", + tokenizer.batch_decode( + pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0], + ) + print("=============================") + print( + "TensorRT model generated text: ", + tokenizer.batch_decode( + trt_gen_tokens, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + )[0], + ) + + +.. GENERATED FROM PYTHON SOURCE LINES 86-91 + +The output sentences should look like +============================= +Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my +============================= +TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll ever be able to walk with my + + +.. rst-class:: sphx-glr-timing + + **Total running time of the script:** ( 0 minutes 0.000 seconds) + + +.. _sphx_glr_download_tutorials__rendered_examples_dynamo_torch_export_llama2.py: + +.. only:: html + + .. container:: sphx-glr-footer sphx-glr-footer-example + + + + + .. container:: sphx-glr-download sphx-glr-download-python + + :download:`Download Python source code: torch_export_llama2.py ` + + .. container:: sphx-glr-download sphx-glr-download-jupyter + + :download:`Download Jupyter notebook: torch_export_llama2.ipynb ` + + +.. only:: html + + .. rst-class:: sphx-glr-signature + + `Gallery generated by Sphinx-Gallery `_ diff --git a/docs/_sources/tutorials/_rendered_examples/index.rst.txt b/docs/_sources/tutorials/_rendered_examples/index.rst.txt index f68c1fb417..c688d7370f 100644 --- a/docs/_sources/tutorials/_rendered_examples/index.rst.txt +++ b/docs/_sources/tutorials/_rendered_examples/index.rst.txt @@ -35,6 +35,8 @@ a number of ways you can leverage this backend to accelerate inference. * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights * :ref:`mutable_torchtrt_module_example`: Compile, use, and modify TensorRT Graph Module with MutableTorchTensorRTModule * :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile`` +* :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times +* :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT @@ -77,6 +79,23 @@ a number of ways you can leverage this backend to accelerate inference.
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py` + +.. raw:: html + +
      Refitting Torch-TensorRT Programs with New Weights
      +
      + + .. raw:: html
      @@ -96,18 +115,18 @@ a number of ways you can leverage this backend to accelerate inference. .. raw:: html -
      +
      .. only:: html - .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_refit_engine_example_thumb.png + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_gpt2_thumb.png :alt: - :ref:`sphx_glr_tutorials__rendered_examples_dynamo_refit_engine_example.py` + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_gpt2.py` .. raw:: html -
      Refit TenorRT Graph Module with Torch-TensorRT
      +
      Compiling GPT2 using the Torch-TensorRT with dynamo backend
      @@ -128,6 +147,40 @@ a number of ways you can leverage this backend to accelerate inference.
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_torch_export_llama2_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_torch_export_llama2.py` + +.. raw:: html + +
      Compiling Llama2 using the Torch-TensorRT with dynamo backend
      +
      + + +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_bert_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_bert_example.py` + +.. raw:: html + +
      Engine Caching (BERT)
      +
      + + .. raw:: html
      @@ -179,6 +232,23 @@ a number of ways you can leverage this backend to accelerate inference.
      +.. raw:: html + +
      + +.. only:: html + + .. image:: /tutorials/_rendered_examples/dynamo/images/thumb/sphx_glr_engine_caching_example_thumb.png + :alt: + + :ref:`sphx_glr_tutorials__rendered_examples_dynamo_engine_caching_example.py` + +.. raw:: html + +
      Engine Caching
      +
      + + .. raw:: html
      diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js index c01a50945c..05b784e0ae 100644 --- a/docs/_static/documentation_options.js +++ b/docs/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: 'v2.5.0.dev0+b3a8cdd', + VERSION: 'v2.5.0.dev0+1d0916f', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/cli/torchtrtc.html b/docs/cli/torchtrtc.html index f6792e2ae9..8fea8f6f37 100644 --- a/docs/cli/torchtrtc.html +++ b/docs/cli/torchtrtc.html @@ -10,7 +10,7 @@ - torchtrtc — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + torchtrtc — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -275,7 +275,7 @@
      - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
      @@ -316,6 +316,9 @@
    • Deploying Torch-TensorRT Programs
    • DLA
    • Torch Compile Advanced Usage
    • +
    • Deploy Quantized Models using Torch-TensorRT
    • +
    • Engine Caching
    • +
    • Refitting Torch-TensorRT Programs with New Weights

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

    Dynamo Frontend

      @@ -341,7 +344,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • diff --git a/docs/dynamo/torch_compile.html b/docs/dynamo/torch_compile.html index a0531ea32e..78991a4b45 100644 --- a/docs/dynamo/torch_compile.html +++ b/docs/dynamo/torch_compile.html @@ -10,7 +10,7 @@ - TensorRT Backend for torch.compile — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + TensorRT Backend for torch.compile — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -40,7 +40,7 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      + + + + + +
      +
      +
      + + + + + + + + + + + +
      +
      +
      + + + + + + + + + + + + + + + + +
      + +
        + +
      • + + + Docs + + > +
      • + + +
      • Engine Caching (BERT)
      • + + +
      • + + + + + +
      • + +
      + + +
      +
      + +
      + Shortcuts +
      +
      + +
      +
      + + + + + + +
      + +
      +
      + + +
      +

      Engine Caching (BERT)

      +

      Small caching example on BERT.

      +
      import numpy as np
      +import torch
      +import torch_tensorrt
      +from engine_caching_example import remove_timing_cache
      +from transformers import BertModel
      +
      +np.random.seed(0)
      +torch.manual_seed(0)
      +
      +model = BertModel.from_pretrained("bert-base-uncased", return_dict=False).cuda().eval()
      +inputs = [
      +    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
      +    torch.randint(0, 2, (1, 14), dtype=torch.int32).to("cuda"),
      +]
      +
      +
      +def compile_bert(iterations=3):
      +    times = []
      +    start = torch.cuda.Event(enable_timing=True)
      +    end = torch.cuda.Event(enable_timing=True)
      +
      +    # The 1st iteration is to measure the compilation time without engine caching
      +    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
      +    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
      +    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
      +    for i in range(iterations):
+        # remove timing cache and reset dynamo for engine caching measurement
      +        remove_timing_cache()
      +        torch._dynamo.reset()
      +
      +        if i == 0:
      +            cache_built_engines = False
      +            reuse_cached_engines = False
      +        else:
      +            cache_built_engines = True
      +            reuse_cached_engines = True
      +
      +        start.record()
      +        compilation_kwargs = {
      +            "use_python_runtime": False,
      +            "enabled_precisions": {torch.float},
      +            "truncate_double": True,
      +            "debug": False,
      +            "min_block_size": 1,
      +            "make_refitable": True,
      +            "cache_built_engines": cache_built_engines,
      +            "reuse_cached_engines": reuse_cached_engines,
      +            "engine_cache_dir": "/tmp/torch_trt_bert_engine_cache",
      +            "engine_cache_size": 1 << 30,  # 1GB
      +        }
      +        optimized_model = torch.compile(
      +            model,
      +            backend="torch_tensorrt",
      +            options=compilation_kwargs,
      +        )
      +        optimized_model(*inputs)
      +        end.record()
      +        torch.cuda.synchronize()
      +        times.append(start.elapsed_time(end))
      +
      +    print("-----compile bert-----> compilation time:\n", times, "milliseconds")
      +
      +
      +if __name__ == "__main__":
      +    compile_bert()
      +
      +
      +

      Total running time of the script: ( 0 minutes 0.000 seconds)

      + +

      Gallery generated by Sphinx-Gallery

      +
      + + +
      + +
      +
      + + + + +
      + + + +
      +

      + © Copyright 2024, NVIDIA Corporation. + +

      +
      + +
      + Built with Sphinx using a theme provided by Read the Docs. +
      + + +
      + +
      +
      + +
      +
      + +
      +
      +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +

      Docs

      +

      Access comprehensive developer documentation for PyTorch

      + View Docs +
      + +
      +

      Tutorials

      +

      Get in-depth tutorials for beginners and advanced developers

      + View Tutorials +
      + +
      +

      Resources

      +

      Find development resources and get your questions answered

      + View Resources +
      +
      +
      +
      + + + + + + + + + +
      +
      +
      +
      + + +
      +
      +
      + + +
      + + + + + + + + \ No newline at end of file diff --git a/docs/tutorials/_rendered_examples/dynamo/engine_caching_example.html b/docs/tutorials/_rendered_examples/dynamo/engine_caching_example.html new file mode 100644 index 0000000000..78ee023bc2 --- /dev/null +++ b/docs/tutorials/_rendered_examples/dynamo/engine_caching_example.html @@ -0,0 +1,1093 @@ + + + + + + + + + + + + + Engine Caching — Torch-TensorRT v2.5.0.dev0+1d0916f documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      + + + + + +
      +
      +
      + + + + + + + + + + + +
      +
      +
      + + + + + + + + + + + + + + + + +
      + + + + +
      +
      + +
      + Shortcuts +
      +
      + +
      +
      + + + + + + +
      + +
      +
      + + +
      +

      Engine Caching

      +

As model sizes increase, the cost of compilation will as well. With AOT methods +like torch.dynamo.compile, this cost is paid upfront. However, if the weights +change, the session ends, or you are using JIT methods like torch.compile, graphs +are invalidated and re-compiled, and this cost is paid repeatedly. +Engine caching is a way to mitigate this cost by saving constructed engines to disk +and re-using them when possible. This tutorial demonstrates how to use engine caching +with TensorRT in PyTorch. Engine caching can significantly speed up subsequent model +compilations by reusing previously built TensorRT engines.

      +

      We’ll explore two approaches:

      +
      +
        +
1. Using torch_tensorrt.dynamo.compile
2. Using torch.compile with the TensorRT backend
      +
      +

      The example uses a pre-trained ResNet18 model and shows the +differences between compilation without caching, with caching enabled, +and when reusing cached engines.

      +
      import os
      +from typing import Dict, Optional
      +
      +import numpy as np
      +import torch
      +import torch_tensorrt as torch_trt
      +import torchvision.models as models
      +from torch_tensorrt.dynamo._defaults import TIMING_CACHE_PATH
      +from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
      +
      +np.random.seed(0)
      +torch.manual_seed(0)
      +
      +model = models.resnet18(pretrained=True).eval().to("cuda")
      +enabled_precisions = {torch.float}
      +debug = False
      +min_block_size = 1
      +use_python_runtime = False
      +
      +
      +def remove_timing_cache(path=TIMING_CACHE_PATH):
      +    if os.path.exists(path):
      +        os.remove(path)
      +
      +
      +
      +

      Engine Caching for JIT Compilation

      +

The primary goal of engine caching is to help speed up JIT workflows. torch.compile +provides a great deal of flexibility in model construction, which makes it a good +first tool to try when looking to speed up your workflow. However, historically +the cost of compilation, and in particular recompilation, has been a barrier to entry +for many users. Prior to the addition of engine caching, if a subgraph got invalidated for some reason, +that graph was reconstructed from scratch. Now, as engines are constructed with cache_built_engines=True, +they are saved to disk, tied to a hash of their corresponding PyTorch subgraph. In +a subsequent compilation, either as part of this session or a new session, the cache will +pull the built engine and refit the weights, which can reduce compilation times by orders of magnitude. +As such, in order to insert a new engine into the cache (i.e. cache_built_engines=True), +the engine must be refittable (make_refitable=True). See Refitting Torch-TensorRT Programs with New Weights for more details.

      +
      def torch_compile(iterations=3):
      +    times = []
      +    start = torch.cuda.Event(enable_timing=True)
      +    end = torch.cuda.Event(enable_timing=True)
      +
      +    # The 1st iteration is to measure the compilation time without engine caching
      +    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
      +    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
      +    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
      +    for i in range(iterations):
      +        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+        # remove timing cache and reset dynamo just for engine caching measurement
      +        remove_timing_cache()
      +        torch._dynamo.reset()
      +
      +        if i == 0:
      +            cache_built_engines = False
      +            reuse_cached_engines = False
      +        else:
      +            cache_built_engines = True
      +            reuse_cached_engines = True
      +
      +        start.record()
      +        compiled_model = torch.compile(
      +            model,
      +            backend="tensorrt",
      +            options={
      +                "use_python_runtime": True,
      +                "enabled_precisions": enabled_precisions,
      +                "debug": debug,
      +                "min_block_size": min_block_size,
      +                "make_refitable": True,
      +                "cache_built_engines": cache_built_engines,
      +                "reuse_cached_engines": reuse_cached_engines,
      +            },
      +        )
      +        compiled_model(*inputs)  # trigger the compilation
      +        end.record()
      +        torch.cuda.synchronize()
      +        times.append(start.elapsed_time(end))
      +
      +    print("----------------torch_compile----------------")
      +    print("disable engine caching, used:", times[0], "ms")
      +    print("enable engine caching to cache engines, used:", times[1], "ms")
      +    print("enable engine caching to reuse engines, used:", times[2], "ms")
      +
      +
      +torch_compile()
      +
      +
      +
      +
      +

      Engine Caching for AOT Compilation

      +

      Similarly to the JIT workflow, AOT workflows can benefit from engine caching. +As the same architecture or common subgraphs get recompiled, the cache will pull +previously built engines and refit the weights.

      +
      def dynamo_compile(iterations=3):
      +    times = []
      +    start = torch.cuda.Event(enable_timing=True)
      +    end = torch.cuda.Event(enable_timing=True)
      +
      +    example_inputs = (torch.randn((100, 3, 224, 224)).to("cuda"),)
      +    # Mark the dim0 of inputs as dynamic
      +    batch = torch.export.Dim("batch", min=1, max=200)
      +    exp_program = torch.export.export(
      +        model, args=example_inputs, dynamic_shapes={"x": {0: batch}}
      +    )
      +
      +    # The 1st iteration is to measure the compilation time without engine caching
      +    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
      +    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
      +    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
      +    for i in range(iterations):
      +        inputs = [torch.rand((100 + i, 3, 224, 224)).to("cuda")]
+        remove_timing_cache()  # remove timing cache just for engine caching measurement
      +        if i == 0:
      +            cache_built_engines = False
      +            reuse_cached_engines = False
      +        else:
      +            cache_built_engines = True
      +            reuse_cached_engines = True
      +
      +        start.record()
      +        trt_gm = torch_trt.dynamo.compile(
      +            exp_program,
      +            tuple(inputs),
      +            use_python_runtime=use_python_runtime,
      +            enabled_precisions=enabled_precisions,
      +            debug=debug,
      +            min_block_size=min_block_size,
      +            make_refitable=True,
      +            cache_built_engines=cache_built_engines,
      +            reuse_cached_engines=reuse_cached_engines,
      +            engine_cache_size=1 << 30,  # 1GB
      +        )
      +        # output = trt_gm(*inputs)
      +        end.record()
      +        torch.cuda.synchronize()
      +        times.append(start.elapsed_time(end))
      +
      +    print("----------------dynamo_compile----------------")
      +    print("disable engine caching, used:", times[0], "ms")
      +    print("enable engine caching to cache engines, used:", times[1], "ms")
      +    print("enable engine caching to reuse engines, used:", times[2], "ms")
      +
      +
      +dynamo_compile()
      +
      +
      +
      +
      +

      Custom Engine Cache

      +

      By default, the engine cache is stored in the system’s temporary directory. Both the cache directory and +size limit can be customized by passing engine_cache_dir and engine_cache_size. +Users can also define their own engine cache implementation by extending the BaseEngineCache class. +This allows for remote or shared caching if so desired.

      +
      +
      The custom engine cache should implement the following methods:
        +
• save: Save the engine blob to the cache.
• load: Load the engine blob from the cache.
      +
      +
      +

The hash provided by the cache system is a weight-agnostic hash of the originating PyTorch subgraph (post lowering). +The blob contains a serialized engine, calling spec data, and weight map information in the pickle format.

      +

Below is an example of a custom engine cache implementation that implements a RAMEngineCache.

      +
      class RAMEngineCache(BaseEngineCache):
      +    def __init__(
      +        self,
      +    ) -> None:
      +        """
      +        Constructs a user held engine cache in memory.
      +        """
      +        self.engine_cache: Dict[str, bytes] = {}
      +
      +    def save(
      +        self,
      +        hash: str,
      +        blob: bytes,
      +    ):
      +        """
      +        Insert the engine blob to the cache.
      +
      +        Args:
      +            hash (str): The hash key to associate with the engine blob.
      +            blob (bytes): The engine blob to be saved.
      +
      +        Returns:
      +            None
      +        """
      +        self.engine_cache[hash] = blob
      +
      +    def load(self, hash: str) -> Optional[bytes]:
      +        """
      +        Load the engine blob from the cache.
      +
      +        Args:
      +            hash (str): The hash key of the engine to load.
      +
      +        Returns:
      +            Optional[bytes]: The engine blob if found, None otherwise.
      +        """
      +        if hash in self.engine_cache:
      +            return self.engine_cache[hash]
      +        else:
      +            return None
      +
      +
      +def torch_compile_my_cache(iterations=3):
      +    times = []
      +    engine_cache = RAMEngineCache()
      +    start = torch.cuda.Event(enable_timing=True)
      +    end = torch.cuda.Event(enable_timing=True)
      +
      +    # The 1st iteration is to measure the compilation time without engine caching
      +    # The 2nd and 3rd iterations are to measure the compilation time with engine caching.
      +    # Since the 2nd iteration needs to compile and save the engine, it will be slower than the 1st iteration.
      +    # The 3rd iteration should be faster than the 1st iteration because it loads the cached engine.
      +    for i in range(iterations):
      +        inputs = [torch.rand((100, 3, 224, 224)).to("cuda")]
+        # remove timing cache and reset dynamo just for engine caching measurement
      +        remove_timing_cache()
      +        torch._dynamo.reset()
      +
      +        if i == 0:
      +            cache_built_engines = False
      +            reuse_cached_engines = False
      +        else:
      +            cache_built_engines = True
      +            reuse_cached_engines = True
      +
      +        start.record()
      +        compiled_model = torch.compile(
      +            model,
      +            backend="tensorrt",
      +            options={
      +                "use_python_runtime": True,
      +                "enabled_precisions": enabled_precisions,
      +                "debug": debug,
      +                "min_block_size": min_block_size,
      +                "make_refitable": True,
      +                "cache_built_engines": cache_built_engines,
      +                "reuse_cached_engines": reuse_cached_engines,
      +                "custom_engine_cache": engine_cache,
      +            },
      +        )
      +        compiled_model(*inputs)  # trigger the compilation
      +        end.record()
      +        torch.cuda.synchronize()
      +        times.append(start.elapsed_time(end))
      +
      +    print("----------------torch_compile----------------")
      +    print("disable engine caching, used:", times[0], "ms")
      +    print("enable engine caching to cache engines, used:", times[1], "ms")
      +    print("enable engine caching to reuse engines, used:", times[2], "ms")
      +
      +
      +torch_compile_my_cache()
      +
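The same cache object should also be usable from the AOT path, assuming torch_tensorrt.dynamo.compile accepts the custom_engine_cache argument in the same way as the torch.compile option above (an assumption of this sketch; exp_program and inputs are as defined in the AOT example earlier on this page):
+trt_gm = torch_trt.dynamo.compile(
+    exp_program,
+    tuple(inputs),
+    make_refitable=True,
+    cache_built_engines=True,
+    reuse_cached_engines=True,
+    custom_engine_cache=engine_cache,  # assumption: same option name as in the torch.compile options
+)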
      +
      +

      Total running time of the script: ( 0 minutes 0.000 seconds)

      + +

      Gallery generated by Sphinx-Gallery

      +
      +
      + + +
      + +
      + + +
      +
      + + +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +

      Docs

      +

      Access comprehensive developer documentation for PyTorch

      + View Docs +
      + +
      +

      Tutorials

      +

      Get in-depth tutorials for beginners and advanced developers

      + View Tutorials +
      + +
      +

      Resources

      +

      Find development resources and get your questions answered

      + View Resources +
      +
      +
      +
      + + + + + + + + + +
      +
      +
      +
      + + +
      +
      +
      + + +
      + + + + + + + + \ No newline at end of file diff --git a/docs/tutorials/_rendered_examples/dynamo/index.html b/docs/tutorials/_rendered_examples/dynamo/index.html index be6b1c8ea8..5f683a8b29 100644 --- a/docs/tutorials/_rendered_examples/dynamo/index.html +++ b/docs/tutorials/_rendered_examples/dynamo/index.html @@ -10,7 +10,7 @@ - Dynamo / torch.compile — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Dynamo / torch.compile — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -273,7 +273,7 @@
      - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
      @@ -314,6 +314,9 @@
    • Deploying Torch-TensorRT Programs
    • DLA
    • Torch Compile Advanced Usage
    • +
    • Deploy Quantized Models using Torch-TensorRT
    • +
    • Engine Caching
    • +
    • Refitting Torch-TensorRT Programs with New Weights

    Dynamo Frontend

    Torch Compile Stable Diffusion

    @@ -474,15 +478,24 @@

    Torch Export with Cudagraphs

    Torch Export with Cudagraphs
    +
    +

    Refitting Torch-TensorRT Programs with New Weights

    +
    Refitting Torch-TensorRT Programs with New Weights

    Compiling a Transformer using torch.compile and TensorRT

    Compiling a Transformer using torch.compile and TensorRT
    -
    -

    Refit TenorRT Graph Module with Torch-TensorRT

    -
    Refit TenorRT Graph Module with Torch-TensorRT
    +
    +

    Compiling GPT2 using the Torch-TensorRT with dynamo backend

    +
    Compiling GPT2 using the Torch-TensorRT with dynamo backend

    Torch Compile Advanced Usage

    Torch Compile Advanced Usage
    +
    +

    Compiling Llama2 using the Torch-TensorRT with dynamo backend

    +
    Compiling Llama2 using the Torch-TensorRT with dynamo backend
    +
    +

    Engine Caching (BERT)

    +
    Engine Caching (BERT)

    Mutable Torch TensorRT Module

    Mutable Torch TensorRT Module
    @@ -492,6 +505,9 @@

    Deploy Quantized Models using Torch-TensorRT

    Deploy Quantized Models using Torch-TensorRT
    +
    +

    Engine Caching

    +
    Engine Caching

    Using Custom Kernels within TensorRT Engines with Torch-TensorRT

    Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    diff --git a/docs/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.html b/docs/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.html index b6d222ca05..a617a37fce 100644 --- a/docs/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.html +++ b/docs/tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example.html @@ -10,7 +10,7 @@ - Mutable Torch TensorRT Module — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Mutable Torch TensorRT Module — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -275,7 +275,7 @@
    - v2.5.0.dev0+b3a8cdd + v2.5.0.dev0+1d0916f
    @@ -316,6 +316,9 @@
  • Deploying Torch-TensorRT Programs
  • DLA
  • Torch Compile Advanced Usage
  • +
  • Deploy Quantized Models using Torch-TensorRT
  • +
  • Engine Caching
  • +
  • Refitting Torch-TensorRT Programs with New Weights
  • Dynamo Frontend

      @@ -341,7 +344,6 @@
    • Example notebooks
    • Compiling ResNet using the Torch-TensorRT torch.compile Backend
    • Compiling a Transformer using torch.compile and TensorRT
    • -
    • Torch Compile Advanced Usage
    • Torch Compile Stable Diffusion
    • Torch Export with Cudagraphs
    • Using Custom Kernels within TensorRT Engines with Torch-TensorRT
    • diff --git a/docs/tutorials/_rendered_examples/dynamo/refit_engine_example.html b/docs/tutorials/_rendered_examples/dynamo/refit_engine_example.html index ad1b46f3bb..210e572b3b 100644 --- a/docs/tutorials/_rendered_examples/dynamo/refit_engine_example.html +++ b/docs/tutorials/_rendered_examples/dynamo/refit_engine_example.html @@ -10,7 +10,7 @@ - Refit TenorRT Graph Module with Torch-TensorRT — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Refitting Torch-TensorRT Programs with New Weights — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -39,6 +39,8 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      + + + + + +
      +
      +
      + + + + + + + + + + + +
      +
      +
      + + + + + + + + + + + + + + + + +
      + +
        + +
      • + + + Docs + + > +
      • + + +
      • Compiling GPT2 using the Torch-TensorRT with dynamo backend
      • + + +
      • + + + + + +
      • + +
      + + +
      +
      + +
      + Shortcuts +
      +
      + +
      +
      + + + + + + +
      + +
      +
      + + +
      +

      Compiling GPT2 using the Torch-TensorRT with dynamo backend

      +

      This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a GPT2 model.

      +
      +

      Imports and Model Definition

      +
      import torch
      +import torch_tensorrt
      +from transformers import AutoModelForCausalLM, AutoTokenizer
      +from utils import export_llm, generate
      +
      +
      +
      # Define the parameters and initialize the model
      +MAX_TOKENS = 32
      +DEVICE = torch.device("cuda:0")
      +
      +# Define the GPT2 model from hugging face
      +# kv_cache is not supported in Torch-TRT currently.
      +# CPU is used here so that GPU memory is reserved for TRT compilation.
      +with torch.no_grad():
      +    tokenizer = AutoTokenizer.from_pretrained("gpt2")
      +    model = AutoModelForCausalLM.from_pretrained(
      +        "gpt2",
      +        pad_token_id=tokenizer.eos_token_id,
      +        use_cache=False,
      +        attn_implementation="eager",
      +    ).eval()
      +
      +
      +

      Tokenize a sample input prompt and get pytorch model outputs

      +
      prompt = "I enjoy walking with my cute dog"
      +model_inputs = tokenizer(prompt, return_tensors="pt")
      +input_ids = model_inputs["input_ids"]
      +
      +# Auto-regressive generation loop for greedy decoding using PyTorch model
      +# We use a custom generate function which is very similar to the huggingface one.
      +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
      +
      +
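The generate helper above comes from the example utils module and is not reproduced on this page. Purely to illustrate what such a greedy-decoding loop does, a hypothetical sketch (not the actual helper, which may differ) is:
+def greedy_generate(model, input_ids, max_new_tokens, eos_token_id):
+    # Hypothetical sketch: repeatedly append the argmax token until EOS or the budget is hit.
+    with torch.no_grad():
+        for _ in range(max_new_tokens):
+            outputs = model(input_ids)
+            logits = outputs[0] if isinstance(outputs, (tuple, list)) else outputs.logits
+            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
+            input_ids = torch.cat([input_ids, next_token], dim=-1)
+            if (next_token == eos_token_id).all():
+                break
+    return input_ids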
      +
      +
      +

      Compilation with Torch-TensorRT using dynamo backend and generate TensorRT outputs

      +
      # Export the GPT2 model into an ExportedProgram which is input of TRT compilation
      +gpt2_ep = export_llm(model, input_ids, max_seq_len=1024)
      +trt_model = torch_tensorrt.dynamo.compile(
      +    gpt2_ep,
      +    inputs=[input_ids],
      +    enabled_precisions={torch.float32},
      +    truncate_double=True,
      +    device=DEVICE,
      +    disable_tf32=True,
      +)
      +
      +# Auto-regressive generation loop for greedy decoding using TensorRT model
      +# We use a custom generate function which is very similar to the huggingface one.
      +# Move inputs to GPU
      +input_ids = input_ids.to(DEVICE)
      +trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
      +
      +
      +
      +
      +

      Decode the output sentences of PyTorch and TensorRT

      +
      print("=============================")
      +print(
      +    "Pytorch model generated text: ",
      +    tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
      +)
      +print("=============================")
      +print(
      +    "TensorRT model generated text: ",
      +    tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
      +)
      +
      +
      +
      +
      +
      +

      The output sentences should look like

      +
      +
      +

      Pytorch model generated text: I enjoy walking with my cute dog, but I’m not sure if I’ll ever be able to walk with my dog. I’m not sure if I’ll ever be able to walk with my

      +

      TensorRT model generated text: I enjoy walking with my cute dog, but I’m not sure if I’ll ever be able to walk with my dog. I’m not sure if I’ll ever be able to walk with my

      +

      Total running time of the script: ( 0 minutes 0.000 seconds)

      + +

      Gallery generated by Sphinx-Gallery

      +
      + + +
      + +
      +
      + + + + +
      + + + +
      +

      + © Copyright 2024, NVIDIA Corporation. + +

      +
      + +
      + Built with Sphinx using a theme provided by Read the Docs. +
      + + +
      + +
      +
      + + +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +

      Docs

      +

      Access comprehensive developer documentation for PyTorch

      + View Docs +
      + +
      +

      Tutorials

      +

      Get in-depth tutorials for beginners and advanced developers

      + View Tutorials +
      + +
      +

      Resources

      +

      Find development resources and get your questions answered

      + View Resources +
      +
      +
      +
      + + + + + + + + + +
      +
      +
      +
      + + +
      +
      +
      + + +
      + + + + + + + + \ No newline at end of file diff --git a/docs/tutorials/_rendered_examples/dynamo/torch_export_llama2.html b/docs/tutorials/_rendered_examples/dynamo/torch_export_llama2.html new file mode 100644 index 0000000000..317e22851f --- /dev/null +++ b/docs/tutorials/_rendered_examples/dynamo/torch_export_llama2.html @@ -0,0 +1,893 @@ + + + + + + + + + + + + + Compiling Llama2 using the Torch-TensorRT with dynamo backend — Torch-TensorRT v2.5.0.dev0+1d0916f documentation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      + + + + + +
      +
      +
      + + + + + + + + + + + +
      +
      +
      + + + + + + + + + + + + + + + + +
      + +
        + +
      • + + + Docs + + > +
      • + + +
      • Compiling Llama2 using the Torch-TensorRT with dynamo backend
      • + + +
      • + + + + + +
      • + +
      + + +
      +
      + +
      + Shortcuts +
      +
      + +
      +
      + + + + + + +
      + +
      +
      + + +
      +

      Compiling Llama2 using the Torch-TensorRT with dynamo backend

      +

      This interactive script is intended as a sample of the Torch-TensorRT workflow with dynamo backend on a Llama2 model.

      +
      +

      Imports and Model Definition

      +
      import torch
      +import torch_tensorrt
      +from transformers import AutoModelForCausalLM, AutoTokenizer
      +from utils import export_llm, generate
      +
      +
      +

      Define the parameters and initialize the model

      +
      MAX_TOKENS = 32
      +DEVICE = torch.device("cuda:0")
      +
      +# Define the Llama2 model from hugging face
      +# kv_cache is not supported in Torch-TRT currently.
      +# CPU is used here so that GPU memory is reserved for TRT compilation.
      +llama_path = "meta-llama/Llama-2-7b-chat-hf"
      +with torch.no_grad():
      +    model = AutoModelForCausalLM.from_pretrained(
      +        llama_path, use_cache=False, attn_implementation="eager"
      +    ).eval()
      +
      +tokenizer = AutoTokenizer.from_pretrained(llama_path)
      +
      +
      +

      Tokenize a sample input prompt and get pytorch model outputs

      +
      prompt = "What is dynamic programming?"
      +model_inputs = tokenizer(prompt, return_tensors="pt")
      +input_ids = model_inputs.input_ids
      +
      +# Auto-regressive generation loop for greedy decoding using PyTorch model
      +# We use a custom generate function which is very similar to the huggingface one.
      +pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
      +
      +
      +
      +
      +

      Compilation with Torch-TensorRT using dynamo backend and generate TensorRT outputs

      +
      # Export the llama2 model into an ExportedProgram which is input of TRT compilation
      +llama2_ep = export_llm(model, input_ids, max_seq_len=64)
      +trt_model = torch_tensorrt.dynamo.compile(
      +    llama2_ep,
      +    inputs=[input_ids],
      +    enabled_precisions={torch.float32},
      +    min_block_size=1,
      +    truncate_double=True,
      +    device=DEVICE,
      +    disable_tf32=True,
      +)
      +
      +# Auto-regressive generation loop for greedy decoding using TensorRT model
      +# We use a custom generate function which is very similar to the huggingface one.
      +# Move inputs to GPU
      +input_ids = input_ids.to(DEVICE)
      +trt_gen_tokens = generate(trt_model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
      +
      +
      +
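The export_llm helper used above is also imported from the example utils module and is not shown here. As a rough, hypothetical sketch only (assuming a single positional input_ids tensor and one dynamic sequence dimension; the real helper may handle more cases), it could be written as:
+def export_llm_sketch(model, input_ids, max_seq_len=64):
+    # Hypothetical sketch: export with a dynamic sequence-length dimension so the
+    # compiled engine can accept the growing inputs produced during decoding.
+    seq_len = torch.export.Dim("seq_len", min=1, max=max_seq_len)
+    return torch.export.export(
+        model,
+        args=(input_ids,),
+        dynamic_shapes=({1: seq_len},),
+    )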
      +
      +

      Decode the output sentences of PyTorch and TensorRT

      +
      print("=============================")
      +print(
      +    "Pytorch model generated text: ",
      +    tokenizer.batch_decode(
      +        pyt_gen_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
      +    )[0],
      +)
      +print("=============================")
      +print(
      +    "TensorRT model generated text: ",
      +    tokenizer.batch_decode(
      +        trt_gen_tokens,
      +        skip_special_tokens=True,
      +        clean_up_tokenization_spaces=False,
      +    )[0],
      +)
      +
      +
      +
      +
      +
      +

      The output sentences should look like

      +
      +
      +

      Pytorch model generated text: I enjoy walking with my cute dog, but I’m not sure if I’ll ever be able to walk with my dog. I’m not sure if I’ll ever be able to walk with my

      +

      TensorRT model generated text: I enjoy walking with my cute dog, but I’m not sure if I’ll ever be able to walk with my dog. I’m not sure if I’ll ever be able to walk with my

      +

      Total running time of the script: ( 0 minutes 0.000 seconds)

      + +

      Gallery generated by Sphinx-Gallery

      +
      + + +
      + +
      +
      + + + + +
      + + + +
      +

      + © Copyright 2024, NVIDIA Corporation. + +

      +
      + +
      + Built with Sphinx using a theme provided by Read the Docs. +
      + + +
      + +
      +
      + + +
      +
      + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
      +
      +
      +
      +

      Docs

      +

      Access comprehensive developer documentation for PyTorch

      + View Docs +
      + +
      +

      Tutorials

      +

      Get in-depth tutorials for beginners and advanced developers

      + View Tutorials +
      + +
      +

      Resources

      +

      Find development resources and get your questions answered

      + View Resources +
      +
      +
      +
      + + + + + + + + + +
      +
      +
      +
      + + +
      +
      +
      + + +
      + + + + + + + + \ No newline at end of file diff --git a/docs/tutorials/_rendered_examples/dynamo/vgg16_ptq.html b/docs/tutorials/_rendered_examples/dynamo/vgg16_ptq.html index c39e454736..fc6814ef94 100644 --- a/docs/tutorials/_rendered_examples/dynamo/vgg16_ptq.html +++ b/docs/tutorials/_rendered_examples/dynamo/vgg16_ptq.html @@ -10,7 +10,7 @@ - Deploy Quantized Models using Torch-TensorRT — Torch-TensorRT v2.5.0.dev0+b3a8cdd documentation + Deploy Quantized Models using Torch-TensorRT — Torch-TensorRT v2.5.0.dev0+1d0916f documentation @@ -39,6 +39,8 @@ + +