diff --git a/docker/Dockerfile.lab b/docker/Dockerfile.lab
new file mode 100644
index 0000000000..569acfecad
--- /dev/null
+++ b/docker/Dockerfile.lab
@@ -0,0 +1,33 @@
+# syntax=docker/dockerfile:1
+
+# Base image starts with CUDA
+ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04
+FROM ${BASE_IMG} as base
+ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04
+
+ARG PYTHON_VERSION=3.10
+ENV PYTHON_VERSION=${PYTHON_VERSION}
+
+ARG USE_CXX11_ABI
+ENV USE_CXX11=${USE_CXX11_ABI}
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install basic dependencies
+RUN apt-get update
+RUN apt install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8
+
+# Install PyEnv and desired Python version
+ENV HOME="/root"
+ENV PYENV_DIR="$HOME/.pyenv"
+ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH"
+RUN wget -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\
+    chmod 755 pyenv-installer &&\
+    bash pyenv-installer &&\
+    eval "$(pyenv init -)"
+
+RUN pyenv install -v ${PYTHON_VERSION}
+RUN pyenv global ${PYTHON_VERSION}
+
+# Setup Bazel via Bazelisk
+RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\
+    chmod a+x /usr/bin/bazel
diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt
index 59a802918c..41fe29f09c 100644
--- a/examples/dynamo/requirements.txt
+++ b/examples/dynamo/requirements.txt
@@ -1,4 +1,4 @@
 cupy==13.1.0
 triton==2.3.0
 diffusers==0.30.3
-transformers==4.44.2
\ No newline at end of file
+transformers==4.44.2
diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py
new file mode 100644
index 0000000000..6c6e1b03a2
--- /dev/null
+++ b/examples/dynamo/torch_compile_gpt2.py
@@ -0,0 +1,100 @@
+"""
+.. _torch_compile_gpt2:
+
+Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend
+=================================================================
+
+This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model."""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+import torch
+import torch_tensorrt
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# %%
+
+# Define the parameters
+MAX_TOKENS = 32
+DEVICE = torch.device("cuda:0")
+
+# Define the GPT2 model from Hugging Face.
+# kv_cache is not supported in Torch-TRT currently, so it is disabled (use_cache=False).
+# The model is loaded in eval mode and moved to the GPU.
+with torch.no_grad():
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    model = (
+        AutoModelForCausalLM.from_pretrained(
+            "gpt2",
+            pad_token_id=tokenizer.eos_token_id,
+            use_cache=False,
+            attn_implementation="eager",
+        )
+        .eval()
+        .cuda()
+    )
+
+# %%
+# Tokenize a sample input prompt and get pytorch model outputs
+prompt = "I enjoy walking with my cute dog"
+model_inputs = tokenizer(prompt, return_tensors="pt")
+input_ids = model_inputs["input_ids"].cuda()
+
+# Auto-regressive generation loop for greedy search using the PyTorch model.
+pyt_gen_tokens = model.generate(
+    input_ids,
+    max_length=MAX_TOKENS,
+    use_cache=False,
+    pad_token_id=tokenizer.eos_token_id,
+)
+
+# %%
+# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Compile the model and mark the input sequence length as dynamic
+torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023)
+model.forward = torch.compile(
+    model.forward,
+    backend="tensorrt",
+    dynamic=None,
+    options={
+        "enabled_precisions": {torch.float32},
+        "disable_tf32": True,
+        "min_block_size": 1,
+        "debug": True,
+    },
+)
+
+# Auto-regressive generation loop for greedy decoding using the TensorRT model.
+# Generating the first token compiles the model with TensorRT, and generating
+# the second token triggers a recompilation.
+trt_gen_tokens = model.generate(
+    inputs=input_ids,
+    max_length=MAX_TOKENS,
+    use_cache=False,
+    pad_token_id=tokenizer.eos_token_id,
+)
+
+# %%
+# Decode the output sentences of PyTorch and TensorRT
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+print("=============================")
+print(
+    "Pytorch model generated text: ",
+    tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
+)
+print("=============================")
+print(
+    "TensorRT model generated text: ",
+    tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
+)
+
+# %%
+# The output sentences should look like:
+
+# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
+# =============================
+# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
diff --git a/examples/dynamo/torch_compile_llama2.py b/examples/dynamo/torch_compile_llama2.py
new file mode 100644
index 0000000000..40ddc97d2c
--- /dev/null
+++ b/examples/dynamo/torch_compile_llama2.py
@@ -0,0 +1,89 @@
+"""
+.. _torch_compile_llama2:
+
+Compiling Llama2 using the Torch-TensorRT `torch.compile` Backend
+===================================================================
+
+This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a Llama2 model."""
+
+# %%
+# Imports and Model Definition
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+import torch
+import torch_tensorrt
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from utils import generate
+
+# %%
+
+# Define the parameters
+MAX_TOKENS = 32
+DEVICE = torch.device("cuda:0")
+
+# Define the Llama2 model from Hugging Face.
+# kv_cache is not supported in Torch-TRT currently, so it is disabled (use_cache=False).
+# The model is kept on the CPU so that GPU memory is reserved for TRT compilation.
+llama_path = "meta-llama/Llama-2-7b-chat-hf"
+with torch.no_grad():
+    model = AutoModelForCausalLM.from_pretrained(
+        llama_path, use_cache=False, attn_implementation="eager"
+    ).eval()
+
+tokenizer = AutoTokenizer.from_pretrained(llama_path)
+
+# %%
+# Tokenize a sample input prompt and get pytorch model outputs
+prompt = "I enjoy walking with my cute dog"
+model_inputs = tokenizer(prompt, return_tensors="pt")
+input_ids = model_inputs["input_ids"].cuda()
+
+# Auto-regressive generation loop for greedy search using the PyTorch model.
+# We use a custom generate function which is very similar to the huggingface one.
+# The PyTorch baseline runs on CPU inputs since the model weights stay on the CPU.
+pyt_gen_tokens = generate(
+    model, model_inputs["input_ids"], MAX_TOKENS, tokenizer.eos_token_id
+)
+
+# %%
+# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+# Compile the model and mark the input sequence length as dynamic
+with torch_tensorrt.logging.debug():
+    torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023)
+    model.forward = torch.compile(
+        model.forward,
+        backend="tensorrt",
+        dynamic=None,
+        options={
+            "enabled_precisions": {torch.float32},
+            "disable_tf32": True,
+            "debug": True,
+            # "use_python_runtime": True
+        },
+    )
+
+# Auto-regressive generation loop for greedy decoding using the TensorRT model.
+# We use a custom generate function which is very similar to the huggingface one.
+# Move inputs to GPU
+input_ids = input_ids.to(DEVICE)
+trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)
+
+# %%
+# Decode the output sentences of PyTorch and TensorRT
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+print("=============================")
+print(
+    "Pytorch model generated text: ",
+    tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
+)
+print("=============================")
+print(
+    "TensorRT model generated text: ",
+    tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
+)
+
+# %%
+# The output sentences should look like
+#
+#
diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py
index 25ad99c12d..90f1f3b72c 100644
--- a/examples/dynamo/utils.py
+++ b/examples/dynamo/utils.py
@@ -51,7 +51,14 @@ def generate(model, input_seq, max_tokens, eos_token_id):
     )
     while True:
-        outputs = model(input_seq)
+        outputs = model(
+            input_seq,
+            past_key_values=None,
+            position_ids=None,
+            attention_mask=None,
+            use_cache=False,
+            token_type_ids=None,
+        )
         logits = outputs.logits
         next_token_logits = logits[:, -1, :]
         next_tokens = torch.argmax(next_token_logits, dim=-1)
diff --git a/py/requirements.txt b/py/requirements.txt
index 361afab365..d480ccbd57 100644
--- a/py/requirements.txt
+++ b/py/requirements.txt
@@ -3,6 +3,6 @@ packaging
 pybind11==2.6.2
 --extra-index-url https://download.pytorch.org/whl/nightly/cu124
 torch>=2.6.0.dev,<2.7.0
-torchvision>=0.20.0.dev,<0.21.0
+#torchvision>=0.20.0.dev,<0.21.0
 --extra-index-url https://pypi.ngc.nvidia.com
 pyyaml
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 9859668cd9..d7792e7464 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -608,6 +608,7 @@ def compile(
     trt_gm = compile_module(
         gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
     )
+
     return trt_gm
diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py
index e15ed0495f..f60cdf3fca 100644
--- a/py/torch_tensorrt/dynamo/backend/backends.py
+++ b/py/torch_tensorrt/dynamo/backend/backends.py
@@ -80,7 +80,8 @@ def _pretraced_backend(
             repair_input_aliasing(gm, settings)

             # Remove sym_int placeholders and inputs
-            remove_sym_nodes(gm, settings)
+            remove_sym_nodes(gm, sample_inputs, settings)
+
             torch_inputs = [
                 input for input in sample_inputs if isinstance(input, torch.Tensor)
             ]
@@ -91,7 +92,7 @@ def _pretraced_backend(
             # Invoke AOTAutograd to translate operators to aten
             gm = aot_export_joint_simple(
                 gm,
-                torch_inputs,
+                sample_inputs,
                 trace_joint=False,
                 decompositions=get_decompositions(
                     settings.enable_experimental_decompositions
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py
index 2605dccba6..9f69572059 100644
--- a/py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py
+++ b/py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py
@@ -1,4 +1,5 @@
 import logging
+from typing import Any, Sequence

 import torch
 from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -7,15 +8,17 @@


 def remove_sym_nodes(
-    gm: torch.fx.GraphModule, settings: CompilationSettings
+    gm: torch.fx.GraphModule,
+    sample_inputs: Sequence[Any],
+    settings: CompilationSettings,
 ) -> torch.fx.GraphModule:
     """Remove sym_int placeholders which get inserted due to torch.compile's
     dynamic=True behavior
     """
     # Extract SymInt placeholder Tensors
-    placeholder_sym_ints = [
-        node
-        for node in gm.graph.nodes
+    placeholder_idx_sym_ints = [
+        (idx, node)
+        for idx, node in enumerate(gm.graph.nodes)
         if (
             node.op == "placeholder"
             and isinstance(node.type, type)
@@ -24,8 +27,9 @@ def remove_sym_nodes(
         )
     ]

-    for node in placeholder_sym_ints:
+    for idx, node in placeholder_idx_sym_ints:
         gm.graph.erase_node(node)
+        sample_inputs.pop(idx)

     gm.graph.lint()
     gm.recompile()
diff --git a/pyproject.toml b/pyproject.toml
index 1284e458f4..1ec43f76fe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "cffi>=1.15.1",
     "typing-extensions>=4.7.0",
     "future>=0.18.3",
-    "tensorrt-cu12==10.3.0",
+    #"tensorrt-cu12==10.3.0",
     "torch>=2.6.0.dev,<2.7.0",
     "pybind11==2.6.2",
     "numpy",
@@ -55,9 +55,9 @@ keywords = [
 ]
 dependencies = [
     "torch>=2.6.0.dev,<2.7.0",
-    "tensorrt-cu12==10.3.0",
-    "tensorrt-cu12-bindings==10.3.0",
-    "tensorrt-cu12-libs==10.3.0",
+    #"tensorrt-cu12==10.3.0",
+    #"tensorrt-cu12-bindings==10.3.0",
+    #"tensorrt-cu12-libs==10.3.0",
     "packaging>=23",
     "numpy",
     "typing-extensions>=4.7.0",
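For context, the following is a minimal, self-contained sketch (not part of the diff) of the dynamic-shape torch.compile flow that the new GPT2/Llama2 examples and the remove_sym_nodes change above exercise. TinyLM is a hypothetical stand-in for the Hugging Face models; the snippet assumes a CUDA-enabled torch_tensorrt install that registers the "tensorrt" backend, and the option names are taken from the examples above.

import torch
import torch_tensorrt  # noqa: F401  # registers the "tensorrt" torch.compile backend


class TinyLM(torch.nn.Module):
    """Hypothetical stand-in for the GPT2/Llama2 models used in the examples."""

    def __init__(self, vocab_size: int = 1024, hidden: int = 64) -> None:
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, hidden)
        self.proj = torch.nn.Linear(hidden, vocab_size)

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.proj(self.embed(input_ids))


model = TinyLM().eval().cuda()
input_ids = torch.randint(0, 1024, (1, 8), device="cuda")

# Mark the sequence dimension as dynamic, as the GPT2/Llama2 examples do.
# With dynamic shapes, torch.compile inserts SymInt placeholders into the graph;
# the updated remove_sym_nodes pass drops those placeholders together with the
# matching entries in sample_inputs so that only Tensor inputs reach TensorRT.
torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023)
compiled_forward = torch.compile(
    model.forward,
    backend="tensorrt",
    dynamic=None,
    options={"enabled_precisions": {torch.float32}, "min_block_size": 1},
)

with torch.no_grad():
    logits = compiled_forward(input_ids)         # first call builds the TRT engine
    logits = compiled_forward(input_ids[:, :5])  # a different length should reuse it

The real examples wrap a greedy, Hugging Face-style generate loop around the compiled forward; this sketch only shows the compile-and-call step that the dynamic-dimension handling in this diff is meant to support.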