chore: bug fixes for full and expand (#3019)
peri044 authored Jul 31, 2024
1 parent c99c966 commit 4476792
Showing 7 changed files with 120 additions and 33 deletions.
13 changes: 7 additions & 6 deletions core/runtime/execute_engine.cpp
@@ -114,6 +114,9 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
// Whether cudagraphs needs to record the graph on this pass
bool need_cudagraphs_record = (CUDAGRAPHS_MODE && !_cudagraphs_validate_shapes(inputs, compiled_engine));

// this is a buffer to store shape tensor input addresses throughout the runtime scope
std::list<std::vector<int32_t>> inputShapeTensorValues;

// Initialize outputs to be available throughout the succeeding scopes
std::vector<at::Tensor> outputs(compiled_engine->num_io.second);

@@ -177,8 +180,6 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
}
}

// this is a buffer to store shape tensor input addresses throughout the runtime scope
std::list<std::vector<int32_t>> inputShapeTensorValues;
{
std::unique_ptr<torch::autograd::profiler::RecordProfile> input_profiler_guard;
if (compiled_engine->profile_execution) {
@@ -200,12 +201,12 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
at::Tensor contig_input;

if (compiled_engine->cuda_engine->isShapeInferenceIO(name.c_str())) {
// Shape tensor inputs are casted to int32 explicitly.
// Shape tensor inputs are casted to int64 explicitly.
// Refer to
// https://github.com/NVIDIA/TensorRT/blob/d2f4ef789a9a6ffdf37b55c3f81b486225f6b380/samples/common/sampleInference.cpp#L435
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt32);
std::vector<int32_t> inputs_cpu_vec(
input_cpu.data_ptr<int32_t>(), input_cpu.data_ptr<int32_t>() + input_cpu.numel());
auto input_cpu = inputs[i].clone().contiguous().cpu().to(torch::kInt64);
std::vector<int64_t> inputs_cpu_vec(
input_cpu.data_ptr<int64_t>(), input_cpu.data_ptr<int64_t>() + input_cpu.numel());
inputShapeTensorValues.emplace_back(inputs_cpu_vec);
TORCHTRT_CHECK(
compiled_engine->exec_ctx->setTensorAddress(name.c_str(), inputShapeTensorValues.back().data()),
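The two runtime edits above are related: the list that owns the shape-tensor host buffers is now declared at the top of the runtime scope, before any address is handed to setTensorAddress, so the registered pointers stay valid for the whole call, and the host values are cast to int64 instead of int32, following the linked TensorRT sample. A rough Python sketch of the same pattern, purely illustrative and not the Torch-TensorRT runtime API (exec_ctx is assumed to be a tensorrt.IExecutionContext):

import torch

# Buffers live in a container created before any binding and kept alive until
# execution finishes, because TensorRT only stores the raw addresses.
shape_tensor_buffers: list = []

def bind_shape_input(exec_ctx, name: str, value: torch.Tensor) -> None:
    # Shape-inference inputs are materialized on the host as int64, mirroring
    # the explicit cast added above in execute_engine.cpp.
    host = value.clone().contiguous().cpu().to(torch.int64)
    shape_tensor_buffers.append(host)  # keep the buffer alive past this function
    exec_ctx.set_tensor_address(name, host.data_ptr())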
8 changes: 7 additions & 1 deletion py/torch_tensorrt/dynamo/conversion/impl/full.py
@@ -23,7 +23,13 @@ def full(
) -> TRTTensor:
# in static shape scenario, shape is a list of int
if isinstance(shape, List):
return np.full(shape, fill_value)
# all dimensions are ints, so the shape is fully static
if all(isinstance(dim, int) for dim in shape):
return np.full(shape, fill_value)
else:
shape = impl.cat.cat(
ctx, target, source_ir, name + "_concat_shape", shape, 0
)

# in dynamic shape scenario, shape is a shape tensor
# use IFillLayer to fill the shape tensor with LINSPACE value
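The new else branch matters when the target shape of aten.full depends on a runtime dimension, which is exactly what the test added to test_full_aten.py below exercises. An eager-mode reference of that pattern, illustrative only and with no TensorRT involved:

import torch

def full_from_input(x: torch.Tensor, fill_value: float = 0.11) -> torch.Tensor:
    # Under dynamic shapes, x.shape[0] traces to a symbolic size, so the shape list
    # reaching the converter mixes ITensors with plain ints and cannot go through np.full.
    n = x.shape[0]
    return torch.ops.aten.full.default((n, n + 1), fill_value)

print(full_from_input(torch.zeros(3, 7, 3)).shape)  # torch.Size([3, 4])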
57 changes: 35 additions & 22 deletions py/torch_tensorrt/dynamo/conversion/impl/slice/ops.py
@@ -226,6 +226,7 @@ def expand(
) -> TRTTensor:
shape_rank = len(shape)
initial_tensor_rank = len(input_t.shape)

# If the rank of the input tensor is less than the shape's rank, pad with ones
if initial_tensor_rank < shape_rank:
input_t = prepend_ones(
@@ -244,39 +245,49 @@
# After the above padding, the shape and tensor rank must be equal
assert len(input_t.shape) == shape_rank

shape_t = []
for i in range(shape_rank):
if shape[i] == -1:
shape_t.append(
get_shape(ctx, target, source_ir, name + f"_shape_dim{i}", input_t, i)
)
else:
shape_t.append(shape[i])

# Establish the desired output shape, strides, and starting indices
input_tensor_shape = tuple(input_t.shape)
# Configure the start, strides and output shape tensors
start = tuple([0] * shape_rank)

# TODO: Revisit stride calculation. stride[dim]=0 implies that dimension is being broadcasted.
# stride[dim]=0 implies that dimension is being broadcasted.
# stride should be 1 for all non-broadcasted dims
stride = []
for i, o in zip(input_tensor_shape, shape_t):
# If the shape has ITensor, we treat it as a reshape dim instead of a broadcasted dim
# shape_t cannot have -1. If the input at this dimension has a shape of -1, set the stride to 1. This indicates that the input is dynamic and does not imply broadcasting at that specific dimension.
if isinstance(i, int) and isinstance(o, int) and i != DYNAMIC_DIM:
input_tensor_shape = tuple(input_t.shape)
for i, o in zip(input_tensor_shape, shape):
# If input dim and target shape dim are static, broadcast if they are not equal
# If input dim is known and target shape dim is dynamic we treat it as a broadcasted dim
if (
isinstance(i, int)
and i != DYNAMIC_DIM
and isinstance(o, int)
and o != DYNAMIC_DIM
):
stride.append(int(i == o))
elif isinstance(i, int) and i != DYNAMIC_DIM and isinstance(o, TRTTensor):
stride.append(0)
else:
# No broadcasting is happening. The output should have the same size as input at this dimension.
stride.append(1)

shape_ = shape_t
# Resolve dynamic dimensions in the target shape. These are not broadcasted dims.
# The value at this dimension should be same as input.
target_shape = []
for i in range(shape_rank):
if shape[i] == DYNAMIC_DIM:
target_shape.append(
get_shape(ctx, target, source_ir, name + f"_shape_dim{i}", input_t, i)
)
else:
target_shape.append(shape[i])

target_shape_t = target_shape
# Handle dynamic shapes case where shape has dynamic dimension
if any(isinstance(ele, TRTTensor) for ele in shape_t):
shape_ = cat(
if any(isinstance(ele, TRTTensor) for ele in target_shape_t):
target_shape_t = cat(
ctx,
target,
source_ir,
name + "_shape_concat",
shape_t,
target_shape_t,
0,
cast_dtype=trt.int32,
)
@@ -302,10 +313,12 @@ def expand(
input_t, start=trt.Dims(), shape=trt.Dims(), stride=trt.Dims()
)
layer.set_input(1, start_tensor)
layer.set_input(2, shape_)
layer.set_input(2, target_shape_t)
layer.set_input(3, stride_tensor)
else:
layer = ctx.net.add_slice(input_t, start=start, shape=shape_, stride=stride)
layer = ctx.net.add_slice(
input_t, start=start, shape=target_shape_t, stride=stride
)

set_layer_name(layer, target, name, source_ir)
return layer.get_output(0)
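The core of the rewritten converter is the per-dimension stride rule: stride 0 marks a broadcast dimension, stride 1 copies the input. A standalone sketch of that rule in plain Python, with a string standing in for a target-shape entry that is an ITensor at conversion time and DYNAMIC_DIM assumed to be the -1 placeholder used above:

DYNAMIC_DIM = -1  # assumed placeholder for an unknown dimension

def expand_strides(input_shape, target_shape):
    strides = []
    for i, o in zip(input_shape, target_shape):
        if isinstance(i, int) and i != DYNAMIC_DIM and isinstance(o, int) and o != DYNAMIC_DIM:
            strides.append(int(i == o))  # both static: broadcast (stride 0) only when sizes differ
        elif isinstance(i, int) and i != DYNAMIC_DIM and not isinstance(o, int):
            strides.append(0)  # known input dim expanded to a runtime-sized target dim
        else:
            strides.append(1)  # dynamic input dim: copy as-is, no broadcasting implied
    return strides

# Expanding a (1, 1, 768) parameter to (batch, -1, -1) with a dynamic batch,
# as in the ExpandTargetDynamic test added below:
print(expand_strides((1, 1, 768), ("ITensor:batch", DYNAMIC_DIM, DYNAMIC_DIM)))  # [0, 1, 1]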
8 changes: 8 additions & 0 deletions py/torch_tensorrt/dynamo/lowering/passes/lower_scaled_dot_product_attention.py
@@ -1,3 +1,4 @@
import copy
import logging
import operator
from typing import Callable, Sequence, Tuple
@@ -54,6 +55,13 @@ def lower_scaled_dot_product_attention(
== torch.nn.functional.scaled_dot_product_attention
)

# Copy the metadata of the replaced attention node to the new node
# TODO: Investigate why there are multiple FakeTensors in the metadata.
# We only use the first one as it contains the output shape information for this node.
new_attention_node.meta["val"] = copy.copy(
attention_node_replaced.meta["val"][0]
)

# If the attention operator had keyword-args, copy them to the new node
if attention_node_replaced.kwargs:
new_attention_node.kwargs = {**attention_node_replaced.kwargs}
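The metadata fix above takes a shallow copy of only the first FakeTensor, which, per the in-line comment, is the entry that carries the output shape information the new node needs. A minimal illustration with plain tensors standing in for FakeTensors (shapes and names here are illustrative):

import copy
import torch

old_meta_val = [torch.empty(2, 8, 128, 64), torch.empty(2, 8, 128, 64)]  # stand-ins for the FakeTensors
new_meta_val = copy.copy(old_meta_val[0])  # shallow-copy only the first entry
print(new_meta_val.shape)  # torch.Size([2, 8, 128, 64])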
9 changes: 8 additions & 1 deletion tests/py/dynamo/conversion/harness.py
@@ -399,7 +399,14 @@ def run_test_with_dynamic_shape(
)
# Since the lowering is based on the optimal shape, we need to test with
# a different shape (e.g. the max shape) to exercise dynamic shapes
inputs_max = [spec.example_tensor("max_shape") for spec in input_specs]
inputs_max = [
(
spec.example_tensor("max_shape")
if spec.shape_mode == Input._ShapeMode.DYNAMIC
else spec.example_tensor()
)
for spec in input_specs
]
if not use_example_tensors:
inputs_max = [spec.torch_tensor for spec in input_specs]
super().run_test(mod, inputs_max, interp, rtol, atol, pyt_inputs=pyt_inputs)
32 changes: 29 additions & 3 deletions tests/py/dynamo/conversion/test_expand_aten.py
@@ -37,8 +37,10 @@ def forward(self, x):
("different_ranks", (1, 2, 1), (1, 2, 1), (2, 2, 1), (2, -1, -1, -1)),
]
)
def test_expand_dynamic(self, _, min_shape, opt_shape, max_shape, expanded_shape):
class ExpandDynamic(nn.Module):
def test_expand_dynamic_input(
self, _, min_shape, opt_shape, max_shape, expanded_shape
):
class ExpandInputDynamic(nn.Module):
def forward(self, x):
return torch.ops.aten.expand.default(x, expanded_shape)

@@ -51,10 +53,34 @@ def forward(self, x):
),
]
self.run_test_with_dynamic_shape(
ExpandDynamic(),
ExpandInputDynamic(),
input_specs,
)

@parameterized.expand(
[
("3d_dim", (4, 1, 768), (1, 1, 768)),
]
)
def test_expand_dynamic_target_shape(self, _, input_shape, weight_shape):
class ExpandTargetDynamic(torch.nn.Module):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.cls_token = torch.nn.Parameter(torch.randn(weight_shape).cuda())

def forward(self, x):
batch_size = x.shape[0]
cls_tokens = self.cls_token.expand(batch_size, -1, -1)
embeddings = torch.cat((cls_tokens, x), dim=0)
return embeddings

input_specs = [
Input(dtype=torch.float32, shape=input_shape),
]
self.run_test_with_dynamic_shape(
ExpandTargetDynamic(), input_specs, use_dynamo_tracer=True
)


if __name__ == "__main__":
run_tests()
26 changes: 26 additions & 0 deletions tests/py/dynamo/conversion/test_full_aten.py
@@ -55,6 +55,32 @@ def forward(self, shape):
use_example_tensors=False,
)

@parameterized.expand(
[
((1, 5, 3), (3, 7, 3), (4, 10, 4), 0.11),
]
)
def test_full_dynamic_shape_list(self, min_shape, opt_shape, max_shape, fill_value):
class full(nn.Module):
def forward(self, x):
shape = x.shape[0]
target_shape = (shape, shape + 1)
return torch.ops.aten.full.default(target_shape, fill_value)

inputs = [
torch_tensorrt.Input(
min_shape=min_shape,
opt_shape=opt_shape,
max_shape=max_shape,
dtype=torch.int64,
)
]
self.run_test_with_dynamic_shape(
full(),
inputs,
use_dynamo_tracer=True,
)


if __name__ == "__main__":
run_tests()
