chore: example fixes #3176

Open · wants to merge 11 commits into base: main
33 changes: 33 additions & 0 deletions docker/Dockerfile.lab
@@ -0,0 +1,33 @@
# syntax=docker/dockerfile:1

# Base image starts with CUDA
ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04
FROM ${BASE_IMG} as base
ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04

ARG PYTHON_VERSION=3.10
ENV PYTHON_VERSION=${PYTHON_VERSION}

ARG USE_CXX11_ABI
ENV USE_CXX11=${USE_CXX11_ABI}
ENV DEBIAN_FRONTEND=noninteractive

# Install basic dependencies
RUN apt-get update && apt-get install -y \
    vim build-essential manpages-dev wget zlib1g software-properties-common git \
    libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev ca-certificates curl \
    llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
    mecab-ipadic-utf8 && \
    rm -rf /var/lib/apt/lists/*

# Install PyEnv and desired Python version
ENV HOME="/root"
ENV PYENV_DIR="$HOME/.pyenv"
ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH"
RUN wget https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer && \
    chmod 755 pyenv-installer && \
    bash pyenv-installer && \
    eval "$(pyenv init -)"

RUN pyenv install -v ${PYTHON_VERSION}
RUN pyenv global ${PYTHON_VERSION}

# Setup Bazel via Bazelisk
RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel && \
    chmod a+x /usr/bin/bazel
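For local testing, an image from this file could presumably be built from the repository root with something like `docker build -f docker/Dockerfile.lab --build-arg PYTHON_VERSION=3.10 -t torch-trt-lab .` (the tag name is illustrative, not part of this change).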
2 changes: 1 addition & 1 deletion examples/dynamo/requirements.txt
@@ -1,4 +1,4 @@
cupy==13.1.0
triton==2.3.0
diffusers==0.30.3
transformers==4.44.2
transformers==4.44.2
100 changes: 100 additions & 0 deletions examples/dynamo/torch_compile_gpt2.py
@@ -0,0 +1,100 @@
"""
.. _torch_compile_gpt2:

Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend
======================================================================

This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model."""

# %%
# Imports and Model Definition
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM, AutoTokenizer

# %%

# Define the parameters
MAX_TOKENS = 32
DEVICE = torch.device("cuda:0")

# Define the GPT2 model from Hugging Face.
# kv_cache is not supported in Torch-TRT currently, so it is disabled via use_cache=False.
# The model is loaded in eval mode and placed on the GPU for compilation.
with torch.no_grad():
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = (
        AutoModelForCausalLM.from_pretrained(
            "gpt2",
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False,
            attn_implementation="eager",
        )
        .eval()
        .cuda()
    )

# %%
# Tokenize a sample input prompt and get PyTorch model outputs
prompt = "I enjoy walking with my cute dog"
model_inputs = tokenizer(prompt, return_tensors="pt")
input_ids = model_inputs["input_ids"].cuda()

# Auto-regressive generation loop for greedy search using PyTorch model.
pyt_gen_tokens = model.generate(
    input_ids,
    max_length=MAX_TOKENS,
    use_cache=False,
    pad_token_id=tokenizer.eos_token_id,
)

# %%
# Compilation with `torch.compile` using the TensorRT backend and generation of TensorRT outputs
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Compile the model and mark the input sequence length to be dynamic
torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023)
model.forward = torch.compile(
    model.forward,
    backend="tensorrt",
    dynamic=None,
    options={
        "enabled_precisions": {torch.float32},
        "disable_tf32": True,
        "min_block_size": 1,
        "debug": True,
    },
)

# Auto-regressive generation loop for greedy decoding using TensorRT model
# The first token generation compiles the model using TensorRT and the second token
# encounters recompilation
trt_gen_tokens = model.generate(
    inputs=input_ids,
    max_length=MAX_TOKENS,
    use_cache=False,
    pad_token_id=tokenizer.eos_token_id,
)

# %%
# Decode the output sentences of PyTorch and TensorRT
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

print("=============================")
print(
"Pytorch model generated text: ",
tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
)
print("=============================")
print(
"TensorRT model generated text: ",
tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
)

# %%
# The output sentences should look like:

# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
# =============================
# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll
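
As a quick sanity check, one could append a token-level comparison of the two generations to this example. This is a sketch that assumes the `pyt_gen_tokens` and `trt_gen_tokens` tensors defined above are in scope; it is not part of the committed file:

```python
import torch

# Sketch: compare the PyTorch and TensorRT generations token-for-token.
# Assumes pyt_gen_tokens and trt_gen_tokens from the example above are in scope.
pyt, trt = pyt_gen_tokens.cpu(), trt_gen_tokens.cpu()
if pyt.shape == trt.shape and torch.equal(pyt, trt):
    print("PyTorch and TensorRT generations match token-for-token")
else:
    print("Generations differ; inspect the decoded text above to see where")
```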
89 changes: 89 additions & 0 deletions examples/dynamo/torch_compile_llama2.py
@@ -0,0 +1,89 @@
"""
.. _torch_compile_gpt2:

Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend
==========================================================

This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model."""

# %%
# Imports and Model Definition
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
import torch
import torch_tensorrt
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import generate

# %%

# Define the parameters
MAX_TOKENS = 32
DEVICE = torch.device("cuda:0")

# Define the Llama2 model from Hugging Face.
# kv_cache is not supported in Torch-TRT currently.
# CPU is used here so that GPU memory is reserved for TRT compilation.
llama_path = "meta-llama/Llama-2-7b-chat-hf"
with torch.no_grad():
    model = AutoModelForCausalLM.from_pretrained(
        llama_path, use_cache=False, attn_implementation="eager"
    ).eval()

tokenizer = AutoTokenizer.from_pretrained(llama_path)

# %%
# Tokenize a sample input prompt and get PyTorch model outputs
prompt = "I enjoy walking with my cute dog"
model_inputs = tokenizer(prompt, return_tensors="pt")
input_ids = model_inputs["input_ids"].cuda()

# Auto-regressive generation loop for greedy search using the PyTorch model.
# We use a custom generate function which is very similar to the huggingface one.
# The reference generation runs on the CPU inputs, matching the model's device.
pyt_gen_tokens = generate(
    model, model_inputs["input_ids"], MAX_TOKENS, tokenizer.eos_token_id
)

# %%
# Compilation with `torch.compile` using the TensorRT backend and generation of TensorRT outputs
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# Compile the model and mark the input sequence length to be dynamic
with torch_tensorrt.logging.debug():
    torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023)
    model.forward = torch.compile(
        model.forward,
        backend="tensorrt",
        dynamic=None,
        options={
            "enabled_precisions": {torch.float32},
            "disable_tf32": True,
            "debug": True,
            # "use_python_runtime": True
        },
    )
    # The first call triggers TensorRT compilation; the second call should reuse
    # the compiled engine without recompiling.
    model(input_ids)
    model(input_ids)
# Auto-regressive generation loop for greedy decoding using TensorRT model
# We use a custom generate function which is very similar to the huggingface one.
# Move inputs to GPU
input_ids = input_ids.to(DEVICE)
trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id)

# %%
# Decode the output sentences of PyTorch and TensorRT
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

print("=============================")
print(
"Pytorch model generated text: ",
tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True),
)
print("=============================")
print(
"TensorRT model generated text: ",
tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True),
)

# %%
# The output sentences should look like:
#
#
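
Since this script depends on `from utils import generate`, here is a minimal greedy-decoding loop in the spirit of that helper. It is an illustrative sketch (the `greedy_generate` name is hypothetical), not the actual `examples/dynamo/utils.py` implementation, whose real signature `generate(model, input_seq, max_tokens, eos_token_id)` appears in the hunk further down:

```python
import torch


def greedy_generate(model, input_seq, max_tokens, eos_token_id):
    """Sketch of a greedy auto-regressive loop similar in spirit to utils.generate."""
    while input_seq.shape[1] < max_tokens:
        outputs = model(input_seq)  # full-sequence forward on each step (no KV cache)
        next_token = torch.argmax(outputs.logits[:, -1, :], dim=-1)
        input_seq = torch.cat([input_seq, next_token[:, None]], dim=-1)
        if (next_token == eos_token_id).all():
            break
    return input_seq
```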
9 changes: 8 additions & 1 deletion examples/dynamo/utils.py
@@ -51,7 +51,14 @@ def generate(model, input_seq, max_tokens, eos_token_id)
    )

    while True:
        outputs = model(input_seq)
        outputs = model(
            input_seq,
            past_key_values=None,
            position_ids=None,
            attention_mask=None,
            use_cache=False,
            token_type_ids=None,
        )
        logits = outputs.logits
        next_token_logits = logits[:, -1, :]
        next_tokens = torch.argmax(next_token_logits, dim=-1)
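Passing `past_key_values=None`, `position_ids=None`, `attention_mask=None`, `use_cache=False`, and `token_type_ids=None` explicitly presumably keeps the forward call signature identical on every decoding step, so the compiled graph does not need to re-specialize on kwargs that the HuggingFace `forward` would otherwise fill in.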
2 changes: 1 addition & 1 deletion py/requirements.txt
@@ -3,6 +3,6 @@ packaging
pybind11==2.6.2
--extra-index-url https://download.pytorch.org/whl/nightly/cu124
torch>=2.6.0.dev,<2.7.0
torchvision>=0.20.0.dev,<0.21.0
#torchvision>=0.20.0.dev,<0.21.0
--extra-index-url https://pypi.ngc.nvidia.com
pyyaml
1 change: 1 addition & 0 deletions py/torch_tensorrt/dynamo/_compiler.py
@@ -608,6 +608,7 @@ def compile(
    trt_gm = compile_module(
        gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache
    )

    return trt_gm


5 changes: 3 additions & 2 deletions py/torch_tensorrt/dynamo/backend/backends.py
@@ -80,7 +80,8 @@ def _pretraced_backend(
            repair_input_aliasing(gm, settings)

            # Remove sym_int placeholders and inputs
            remove_sym_nodes(gm, settings)
            remove_sym_nodes(gm, sample_inputs, settings)

            torch_inputs = [
                input for input in sample_inputs if isinstance(input, torch.Tensor)
            ]
@@ -91,7 +92,7 @@
            # Invoke AOTAutograd to translate operators to aten
            gm = aot_export_joint_simple(
                gm,
                torch_inputs,
                sample_inputs,
                trace_joint=False,
                decompositions=get_decompositions(
                    settings.enable_experimental_decompositions
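Passing `sample_inputs` instead of the tensor-only `torch_inputs` into `aot_export_joint_simple` appears to go hand in hand with the `remove_sym_nodes` change below: that pass now drops the SymInt entries from `sample_inputs` in place, so the list handed to AOTAutograd again lines up with the graph's remaining placeholders.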
14 changes: 9 additions & 5 deletions py/torch_tensorrt/dynamo/lowering/passes/remove_sym_nodes.py
@@ -1,4 +1,5 @@
import logging
from typing import Any, Sequence

import torch
from torch_tensorrt.dynamo._settings import CompilationSettings
@@ -7,15 +8,17 @@


def remove_sym_nodes(
    gm: torch.fx.GraphModule, settings: CompilationSettings
    gm: torch.fx.GraphModule,
    sample_inputs: Sequence[Any],
    settings: CompilationSettings,
) -> torch.fx.GraphModule:
    """Remove sym_int placeholders which get inserted due to torch.compile's
    dynamic=True behavior
    """
    # Extract SymInt placeholder Tensors
    placeholder_sym_ints = [
        node
        for node in gm.graph.nodes
    placeholder_idx_sym_ints = [
        (idx, node)
        for idx, node in enumerate(gm.graph.nodes)
        if (
            node.op == "placeholder"
            and isinstance(node.type, type)
@@ -24,8 +27,9 @@ def remove_sym_nodes(
        )
    ]

    for node in placeholder_sym_ints:
    for idx, node in placeholder_idx_sym_ints:
        gm.graph.erase_node(node)
        sample_inputs.pop(idx)

    gm.graph.lint()
    gm.recompile()
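One thing to keep in mind with `sample_inputs.pop(idx)`: the indices are recorded before any removal, so if more than one SymInt placeholder is present, popping in ascending order shifts the later indices; popping back-to-front avoids that. A toy illustration with hypothetical stand-in values (not library code):

```python
import torch

# Stand-ins: indices 0 and 2 play the role of SymInt placeholders added by dynamic shapes.
sample_inputs = [8, torch.randn(2, 8), 4]
sym_idx = [i for i, x in enumerate(sample_inputs) if not isinstance(x, torch.Tensor)]

for i in reversed(sym_idx):  # pop back-to-front so earlier indices stay valid
    sample_inputs.pop(i)

assert all(isinstance(x, torch.Tensor) for x in sample_inputs)
```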
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -8,7 +8,7 @@ requires = [
"cffi>=1.15.1",
"typing-extensions>=4.7.0",
"future>=0.18.3",
"tensorrt-cu12==10.3.0",
#"tensorrt-cu12==10.3.0",
"torch>=2.6.0.dev,<2.7.0",
"pybind11==2.6.2",
"numpy",
@@ -55,9 +55,9 @@ keywords = [
]
dependencies = [
"torch>=2.6.0.dev,<2.7.0",
"tensorrt-cu12==10.3.0",
"tensorrt-cu12-bindings==10.3.0",
"tensorrt-cu12-libs==10.3.0",
#"tensorrt-cu12==10.3.0",
#"tensorrt-cu12-bindings==10.3.0",
#"tensorrt-cu12-libs==10.3.0",
"packaging>=23",
"numpy",
"typing-extensions>=4.7.0",