Fix optional used with GPUExecutionProvider
amancini-N committed Jan 26, 2024
1 parent 28a16c2 commit 10cb01d
Showing 8 changed files with 189 additions and 77 deletions.
8 changes: 8 additions & 0 deletions include/onnxruntime/core/framework/data_types.h
@@ -234,6 +234,10 @@ class DataTypeImpl {
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv9();
@@ -242,6 +246,10 @@ class DataTypeImpl {
static const std::vector<MLDataType>& AllOptionalTypesIRv4();
static const std::vector<MLDataType>& AllOptionalTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeOptionalTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeOptionalTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeOptionalTypesIRv9();

static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypesIRv4();
static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypesIRv9();
60 changes: 60 additions & 0 deletions onnxruntime/core/framework/data_types.cc
@@ -1397,6 +1397,38 @@ const std::vector<MLDataType>& DataTypeImpl::AllOptionalAndTensorAndSequenceTens
return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() {
return AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4();
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() {
static std::vector<MLDataType> all_optional_and_tensor_and_sequence_types =
[]() {
auto temp = AllFixedSizeOptionalTypesIRv4();
const auto tensor = AllFixedSizeTensorTypesIRv4();
temp.insert(temp.end(), tensor.begin(), tensor.end());
const auto& seq = AllFixedSizeSequenceTensorTypesIRv4();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() {
static std::vector<MLDataType> all_optional_and_tensor_and_sequence_types =
[]() {
auto temp = AllFixedSizeOptionalTypesIRv9();
const auto tensor = AllFixedSizeTensorTypesIRv9();
temp.insert(temp.end(), tensor.begin(), tensor.end());
const auto& seq = AllFixedSizeSequenceTensorTypesIRv9();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllOptionalTypes() {
return AllOptionalTypesIRv4();
}
@@ -1425,6 +1457,34 @@ const std::vector<MLDataType>& DataTypeImpl::AllOptionalTypesIRv9() {
return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypes() {
return AllFixedSizeOptionalTypesIRv4();
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypesIRv4() {
static std::vector<MLDataType> all_optional_types =
[]() {
auto temp = GetOptionalTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv4>();
const auto& seq = GetOptionalSequenceTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv4>();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypesIRv9() {
static std::vector<MLDataType> all_optional_types =
[]() {
auto temp = GetOptionalTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv9>();
const auto& seq = GetOptionalSequenceTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv9>();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllTensorAndSequenceTensorAndOptionalTypes() {
return AllTensorAndSequenceTensorAndOptionalTypesIRv4();
}
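The new list builders in data_types.cc reuse the construct-on-first-use idiom found throughout this file: a function-local static vector is populated exactly once by an immediately-invoked lambda that concatenates the smaller per-category lists. Below is a minimal standalone sketch of that idiom; the names and string entries are illustrative stand-ins, not onnxruntime's MLDataType registries.

#include <iostream>
#include <string>
#include <vector>

// Stand-ins for the smaller per-category lists that get concatenated.
static const std::vector<std::string>& OptionalTypes() {
  static const std::vector<std::string> v{"optional(tensor(float))",
                                          "optional(seq(tensor(float)))"};
  return v;
}

static const std::vector<std::string>& TensorTypes() {
  static const std::vector<std::string> v{"tensor(float)", "tensor(int64)"};
  return v;
}

// Combined list, built once on first call by an immediately-invoked lambda.
// Initialization of a function-local static is thread-safe since C++11.
static const std::vector<std::string>& AllOptionalAndTensorTypes() {
  static const std::vector<std::string> all =
      []() {
        auto temp = OptionalTypes();          // copy the first list
        const auto& tensors = TensorTypes();  // then append the second
        temp.insert(temp.end(), tensors.begin(), tensors.end());
        return temp;
      }();
  return all;
}

int main() {
  for (const auto& t : AllOptionalAndTensorTypes()) {
    std::cout << t << "\n";
  }
  return 0;
}

Note that the lambda takes the first list by value on purpose, so the per-category source lists are never mutated while the combined list is being assembled.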
48 changes: 29 additions & 19 deletions onnxruntime/core/framework/utils.cc
@@ -81,33 +81,41 @@ static common::Status AllocateHelper(const AllocatorPtr& allocator,
}

if (source_mlvalue.IsTensor()) {
const Tensor& source_tensor = source_mlvalue.Get<Tensor>();
if (allocator->Info().alloc_type == OrtArenaAllocator) {
void* p_data = nullptr;
if (source_mlvalue.IsAllocated()) {
const Tensor& source_tensor = source_mlvalue.Get<Tensor>();
if (allocator->Info().alloc_type == OrtArenaAllocator) {
void* p_data = nullptr;
#ifdef ORT_ENABLE_STREAM
BFCArena* arena_ptr = static_cast<BFCArena*>(allocator.get());
auto* stream_aware_alloc = StreamAwareArena::FromBFCArena(*arena_ptr);
if (stream_aware_alloc && target_stream) {
size_t len = Tensor::CalculateTensorStorageSize(source_tensor.DataType(), source_tensor.Shape());
p_data = stream_aware_alloc->AllocOnStream(len, target_stream, nullptr);
}
BFCArena* arena_ptr = static_cast<BFCArena*>(allocator.get());
auto* stream_aware_alloc = StreamAwareArena::FromBFCArena(*arena_ptr);
if (stream_aware_alloc && target_stream) {
size_t len = Tensor::CalculateTensorStorageSize(source_tensor.DataType(), source_tensor.Shape());
p_data = stream_aware_alloc->AllocOnStream(len, target_stream, nullptr);
}
#else
ORT_UNUSED_PARAMETER(target_stream);
ORT_UNUSED_PARAMETER(target_stream);
#endif // ORT_ENABLE_STREAM
if (p_data == nullptr) {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
if (p_data == nullptr) {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
p_data,
allocator, target_mlvalue);
}
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
p_data,
allocator, target_mlvalue);
}
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
auto type = DataTypeImpl::GetType<Tensor>();

target_mlvalue.Init(nullptr, // This OrtValue is "None" and has no data
type,
type->GetDeleteFunc());
}
} else if (source_mlvalue.IsSparseTensor()) {
#if !defined(DISABLE_SPARSE_TENSORS)
@@ -173,7 +181,9 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
Tensor* p_output_tensor = target_mlvalue.GetMutable<Tensor>();

if (copy_tensor_pairs != nullptr) {
copy_tensor_pairs->push_back({source_tensor, *p_output_tensor, stream});
if (target_mlvalue.IsAllocated()) {
copy_tensor_pairs->push_back({source_tensor, *p_output_tensor, stream});
}
} else {
ORT_RETURN_IF_ERROR(stream ? session_state.GetDataTransferMgr().CopyTensorAsync(source_tensor, *p_output_tensor, *stream) : session_state.GetDataTransferMgr().CopyTensor(source_tensor, *p_output_tensor));
}
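The utils.cc changes make the cross-device copy path aware of "None" values: AllocateHelper only materializes a target tensor when the source OrtValue actually holds data, otherwise it initializes the target as a typed but unallocated OrtValue, and BatchOrCopyMLValue no longer queues a copy for such targets. The standalone sketch below models that control flow with a hypothetical Value type; it is not ORT's OrtValue/Tensor API, just the same branching reduced to its essentials.

#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical stand-in for an OrtValue: an empty optional is represented as a
// value that is typed but carries no allocated data.
struct Value {
  bool allocated = false;
  std::vector<float> data;  // payload, meaningful only when allocated == true

  bool IsAllocated() const { return allocated; }
};

// Mirrors the AllocateHelper branch: allocate a real target only when the
// source carries data; otherwise propagate a "None" value.
void AllocateLike(const Value& source, Value& target) {
  if (source.IsAllocated()) {
    target.allocated = true;
    target.data.resize(source.data.size());  // allocate storage, no copy yet
  } else {
    target.allocated = false;  // target stays "None": typed, but no data
    target.data.clear();
  }
}

// Mirrors the BatchOrCopyMLValue change: a copy is queued only when the target
// was actually allocated.
void CopyOrSkip(const Value& source, Value& target,
                std::vector<std::pair<const Value*, Value*>>& pending_copies) {
  AllocateLike(source, target);
  if (target.IsAllocated()) {
    pending_copies.push_back({&source, &target});
  }  // else: nothing to copy, the empty optional flows through unchanged
}

int main() {
  Value real{true, {1.0f, 2.0f, 3.0f}};
  Value none{};  // empty optional
  Value out_real, out_none;
  std::vector<std::pair<const Value*, Value*>> copies;

  CopyOrSkip(real, out_real, copies);
  CopyOrSkip(none, out_none, copies);

  std::printf("queued copies: %zu, out_none allocated: %d\n",
              copies.size(), static_cast<int>(out_none.IsAllocated()));
  return 0;
}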
124 changes: 66 additions & 58 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -44,67 +44,75 @@ class Memcpy final : public OpKernel {
auto X_type = ctx->InputType(0);
if (X_type->IsTensorType()) {
const auto* X = ctx->Input<Tensor>(0);
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The cudaMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
if (X_type->IsSparseTensorType()) {
// TODO: support async copy for sparse tensor
// sync the stream first, since it is a sync memory copy
cudaStreamSynchronize(static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()));
const auto* X = ctx->Input<SparseTensor>(0);
if (X != nullptr) {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
SparseTensor* Y = ctx->OutputSparse(0, X->DenseShape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output sparse tensor.");
return X->Copy(Info().GetDataTransferManager(), *Y);
} else if (X_type->IsTensorSequenceType()) {
const TensorSeq* X = ctx->Input<TensorSeq>(0);
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor sequence is nullptr.");
TensorSeq* Y = ctx->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor sequence.");
auto X_dtype = X->DataType();
Y->SetType(X_dtype);
AllocatorPtr alloc;

// If we are copying contents to CUDA, the allocator to use
// to allocate the buffers of the new tensors in the sequence
// can be temp space allocator associated with the CUDA EP
if (Node().OpType() == "MemcpyFromHost") {
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
}
} else {
// If we are copying contents to CPU (op type is "MemcpyToHost"),
// the allocator to use to allocate the buffers of the new tensors
// in the sequence will be the allocator from the CPU EP
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get the CPU allocator.");
}
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The cudaMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
// If X is null, it means we have an unallocated tensor as the result of an empty optional
// Hence we create an output with an unallocated tensor instead of copying
ctx->EmptyOptionalTensorOutput(0);
return Status::OK();
}

} else if (X_type->IsSparseTensorType()) {
// TODO: support async copy for sparse tensor
// sync the stream first, since it is a sync memory copy
cudaStreamSynchronize(static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()));
const auto* X = ctx->Input<SparseTensor>(0);
// TODO: support empty optional
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
SparseTensor* Y = ctx->OutputSparse(0, X->DenseShape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output sparse tensor.");
return X->Copy(Info().GetDataTransferManager(), *Y);
} else if (X_type->IsTensorSequenceType()) {
const TensorSeq* X = ctx->Input<TensorSeq>(0);
// TODO: support empty optional
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor sequence is nullptr.");
TensorSeq* Y = ctx->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor sequence.");
auto X_dtype = X->DataType();
Y->SetType(X_dtype);
AllocatorPtr alloc;

// If we are copying contents to CUDA, the allocator to use
// to allocate the buffers of the new tensors in the sequence
// can be temp space allocator associated with the CUDA EP
if (Node().OpType() == "MemcpyFromHost") {
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
}
auto X_size = X->Size();
Y->Reserve(X_size);
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(), source_tensor.Shape(), alloc);
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(source_tensor.Location().device,
target_tensor->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(source_tensor, *target_tensor, *ctx->GetComputeStream()));
Y->Add(std::move(*target_tensor));
} else {
// If we are copying contents to CPU (op type is "MemcpyToHost"),
// the allocator to use to allocate the buffers of the new tensors
// in the sequence will be the allocator from the CPU EP
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get the CPU allocator.");
}
return Status::OK();
}
return Status(common::ONNXRUNTIME, common::FAIL, "Memcpy: Unsupported input type.");
auto X_size = X->Size();
Y->Reserve(X_size);
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(), source_tensor.Shape(), alloc);
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(source_tensor.Location().device,
target_tensor->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(source_tensor, *target_tensor, *ctx->GetComputeStream()));
Y->Add(std::move(*target_tensor));
}
return Status::OK();
}
return Status(common::ONNXRUNTIME, common::FAIL, "Memcpy: Unsupported input type.");
}
};

@@ -116,7 +124,7 @@ ONNX_OPERATOR_KERNEL_EX(
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypesIRv9()),
.TypeConstraint("T", DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9()),
Memcpy);

ONNX_OPERATOR_KERNEL_EX(
Expand All @@ -126,7 +134,7 @@ ONNX_OPERATOR_KERNEL_EX(
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.OutputMemoryType(OrtMemTypeCPUOutput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypesIRv9()),
.TypeConstraint("T", DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9()),
Memcpy);

} // namespace cuda
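The registration change at the end of this file is what makes Memcpy nodes legal for optional values in the first place: a kernel is only matched to a node if the node's input type appears in the kernel's "T" type-constraint list, and the MemcpyFromHost/MemcpyToHost constraints previously listed only plain tensors and tensor sequences. The sketch below models that matching rule with a hypothetical registry entry; the real ORT kernel registry is considerably richer.

#include <iostream>
#include <set>
#include <string>

// Hypothetical model of a kernel type constraint: the kernel is eligible for a
// node only if the node's input type is in the allowed set.
struct KernelDef {
  std::string op;
  std::set<std::string> allowed_T;

  bool Matches(const std::string& op_type, const std::string& input_type) const {
    return op == op_type && allowed_T.count(input_type) > 0;
  }
};

int main() {
  // Before the fix: only plain tensors and tensor sequences were listed.
  KernelDef memcpy_old{"MemcpyFromHost", {"tensor(float)", "seq(tensor(float))"}};
  // After the fix: the optional variants are listed as well.
  KernelDef memcpy_new{"MemcpyFromHost",
                       {"tensor(float)", "seq(tensor(float))",
                        "optional(tensor(float))", "optional(seq(tensor(float)))"}};

  const std::string node_input = "optional(tensor(float))";
  std::cout << std::boolalpha
            << "old kernel matches: " << memcpy_old.Matches("MemcpyFromHost", node_input) << "\n"
            << "new kernel matches: " << memcpy_new.Matches("MemcpyFromHost", node_input) << "\n";
  return 0;
}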
@@ -615,6 +615,10 @@ struct ProviderHost {
virtual const std::vector<MLDataType>& DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv9() = 0;

virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() = 0;

virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypes() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv9() = 0;
@@ -780,6 +784,10 @@ struct ProviderHost {
virtual Tensor* OpKernelContext__Output(OpKernelContext* p, int index, const TensorShape& shape) = 0;
#if !defined(DISABLE_SPARSE_TENSORS)
virtual SparseTensor* OpKernelContext__OutputSparse(OpKernelContext* p, int index, const TensorShape& shape) = 0;
#endif
#if !defined(DISABLE_OPTIONAL_TYPE)
virtual void OpKernelContext__EmptyOptionalTensorOutput(OpKernelContext* p, int index) = 0;
virtual void OpKernelContext__EmptyOptionalTensorSeqOutput(OpKernelContext* p, int index) = 0;
#endif
virtual Tensor& OpKernelContext__RequiredOutput(OpKernelContext* p, int index, const TensorShape& shape) = 0;
virtual MLDataType OpKernelContext__InputType(const OpKernelContext* p, int index) = 0;
@@ -577,6 +577,10 @@ class DataTypeImpl final {
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv9(); }

static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypes(); }
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9(); }

static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypes() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypes(); }
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv9(); }
@@ -813,6 +817,10 @@ struct OpKernelContext final {
Tensor* Output(int index, const TensorShape& shape) { return g_host->OpKernelContext__Output(this, index, shape); }
#if !defined(DISABLE_SPARSE_TENSORS)
SparseTensor* OutputSparse(int index, const TensorShape& shape) { return g_host->OpKernelContext__OutputSparse(this, index, shape); }
#endif
#if !defined(DISABLE_OPTIONAL_TYPE)
void EmptyOptionalTensorOutput(int index) { return g_host->OpKernelContext__EmptyOptionalTensorOutput(this, index); }
void EmptyOptionalTensorSeqOutput(int index) { return g_host->OpKernelContext__EmptyOptionalTensorSeqOutput(this, index); }
#endif
int OutputCount() const { return g_host->OpKernelContext__OutputCount(this); }

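The last two hunks live in the provider bridge. Execution providers built as shared libraries, the CUDA EP among them, reach core onnxruntime only through the ProviderHost interface, so every new core facility used here (the AllFixedSizeOptional* type lists and the empty-optional outputs on OpKernelContext) needs a virtual on ProviderHost plus a thin wrapper that forwards through the global g_host pointer. A stripped-down sketch of that pattern follows, with illustrative names rather than the real bridge types.

#include <iostream>
#include <string>
#include <vector>

// Host-side interface: core onnxruntime implements these virtuals and hands the
// shared-library provider a pointer to the implementation.
struct ProviderHostSketch {
  virtual ~ProviderHostSketch() = default;
  virtual const std::vector<std::string>& AllOptionalAndTensorTypes() = 0;
};

// Set once when the provider shared library is initialized.
static ProviderHostSketch* g_host_sketch = nullptr;

// Provider-side wrapper: same API surface as the core class, but every call
// forwards across the shared-library boundary through g_host_sketch.
struct DataTypeListsSketch {
  static const std::vector<std::string>& AllOptionalAndTensorTypes() {
    return g_host_sketch->AllOptionalAndTensorTypes();
  }
};

// A concrete host implementation, standing in for core onnxruntime.
struct CoreHost : ProviderHostSketch {
  const std::vector<std::string>& AllOptionalAndTensorTypes() override {
    static const std::vector<std::string> types{
        "tensor(float)", "optional(tensor(float))"};
    return types;
  }
};

int main() {
  CoreHost core;
  g_host_sketch = &core;  // what the real bridge does during provider registration

  for (const auto& t : DataTypeListsSketch::AllOptionalAndTensorTypes()) {
    std::cout << t << "\n";
  }
  return 0;
}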