Fix optional used with GPUExecutionProvider
amancini-N committed Jan 26, 2024
1 parent 28a16c2 commit 10cb01d
Showing 8 changed files with 189 additions and 77 deletions.
8 changes: 8 additions & 0 deletions include/onnxruntime/core/framework/data_types.h
@@ -234,6 +234,10 @@ class DataTypeImpl {
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv9();
@@ -242,6 +246,10 @@ class DataTypeImpl {
static const std::vector<MLDataType>& AllOptionalTypesIRv4();
static const std::vector<MLDataType>& AllOptionalTypesIRv9();

static const std::vector<MLDataType>& AllFixedSizeOptionalTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllFixedSizeOptionalTypesIRv4();
static const std::vector<MLDataType>& AllFixedSizeOptionalTypesIRv9();

static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypes(); // up to IR4 (no float 8), deprecated
static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypesIRv4();
static const std::vector<MLDataType>& AllTensorAndSequenceTensorAndOptionalTypesIRv9();
60 changes: 60 additions & 0 deletions onnxruntime/core/framework/data_types.cc
@@ -1397,6 +1397,38 @@ const std::vector<MLDataType>& DataTypeImpl::AllOptionalAndTensorAndSequenceTens
return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() {
return AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4();
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() {
static std::vector<MLDataType> all_optional_and_tensor_and_sequence_types =
[]() {
auto temp = AllFixedSizeOptionalTypesIRv4();
const auto tensor = AllFixedSizeTensorTypesIRv4();
temp.insert(temp.end(), tensor.begin(), tensor.end());
const auto& seq = AllFixedSizeSequenceTensorTypesIRv4();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() {
static std::vector<MLDataType> all_optional_and_tensor_and_sequence_types =
[]() {
auto temp = AllFixedSizeOptionalTypesIRv9();
const auto tensor = AllFixedSizeTensorTypesIRv9();
temp.insert(temp.end(), tensor.begin(), tensor.end());
const auto& seq = AllFixedSizeSequenceTensorTypesIRv9();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_and_tensor_and_sequence_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllOptionalTypes() {
return AllOptionalTypesIRv4();
}
@@ -1425,6 +1457,34 @@ const std::vector<MLDataType>& DataTypeImpl::AllOptionalTypesIRv9() {
return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypes() {
return AllFixedSizeOptionalTypesIRv4();
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypesIRv4() {
static std::vector<MLDataType> all_optional_types =
[]() {
auto temp = GetOptionalTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv4>();
const auto& seq = GetOptionalSequenceTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv4>();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllFixedSizeOptionalTypesIRv9() {
static std::vector<MLDataType> all_optional_types =
[]() {
auto temp = GetOptionalTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv9>();
const auto& seq = GetOptionalSequenceTensorTypesFromTypeList<element_type_lists::AllFixedSizeIRv9>();
temp.insert(temp.end(), seq.begin(), seq.end());
return temp;
}();

return all_optional_types;
}

const std::vector<MLDataType>& DataTypeImpl::AllTensorAndSequenceTensorAndOptionalTypes() {
return AllTensorAndSequenceTensorAndOptionalTypesIRv4();
}
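The new list builders in data_types.cc reuse the construct-on-first-use idiom found throughout this file: a function-local static vector is populated exactly once by an immediately-invoked lambda that concatenates the smaller per-category lists. Below is a minimal standalone sketch of that idiom; the names and string entries are illustrative stand-ins, not onnxruntime's MLDataType registries.

#include <iostream>
#include <string>
#include <vector>

// Stand-ins for the smaller per-category lists that get concatenated.
static const std::vector<std::string>& OptionalTypes() {
  static const std::vector<std::string> v{"optional(tensor(float))",
                                          "optional(seq(tensor(float)))"};
  return v;
}

static const std::vector<std::string>& TensorTypes() {
  static const std::vector<std::string> v{"tensor(float)", "tensor(int64)"};
  return v;
}

// Combined list, built once on first call by an immediately-invoked lambda.
// Initialization of a function-local static is thread-safe since C++11.
static const std::vector<std::string>& AllOptionalAndTensorTypes() {
  static const std::vector<std::string> all =
      []() {
        auto temp = OptionalTypes();          // copy the first list
        const auto& tensors = TensorTypes();  // then append the second
        temp.insert(temp.end(), tensors.begin(), tensors.end());
        return temp;
      }();
  return all;
}

int main() {
  for (const auto& t : AllOptionalAndTensorTypes()) {
    std::cout << t << "\n";
  }
  return 0;
}

Note that the lambda takes the first list by value on purpose, so the per-category source lists are never mutated while the combined list is being assembled.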
48 changes: 29 additions & 19 deletions onnxruntime/core/framework/utils.cc
@@ -81,33 +81,41 @@ static common::Status AllocateHelper(const AllocatorPtr& allocator,
}

if (source_mlvalue.IsTensor()) {
const Tensor& source_tensor = source_mlvalue.Get<Tensor>();
if (allocator->Info().alloc_type == OrtArenaAllocator) {
void* p_data = nullptr;
if (source_mlvalue.IsAllocated()) {
const Tensor& source_tensor = source_mlvalue.Get<Tensor>();
if (allocator->Info().alloc_type == OrtArenaAllocator) {
void* p_data = nullptr;
#ifdef ORT_ENABLE_STREAM
BFCArena* arena_ptr = static_cast<BFCArena*>(allocator.get());
auto* stream_aware_alloc = StreamAwareArena::FromBFCArena(*arena_ptr);
if (stream_aware_alloc && target_stream) {
size_t len = Tensor::CalculateTensorStorageSize(source_tensor.DataType(), source_tensor.Shape());
p_data = stream_aware_alloc->AllocOnStream(len, target_stream, nullptr);
}
BFCArena* arena_ptr = static_cast<BFCArena*>(allocator.get());
auto* stream_aware_alloc = StreamAwareArena::FromBFCArena(*arena_ptr);
if (stream_aware_alloc && target_stream) {
size_t len = Tensor::CalculateTensorStorageSize(source_tensor.DataType(), source_tensor.Shape());
p_data = stream_aware_alloc->AllocOnStream(len, target_stream, nullptr);
}
#else
ORT_UNUSED_PARAMETER(target_stream);
ORT_UNUSED_PARAMETER(target_stream);
#endif // ORT_ENABLE_STREAM
if (p_data == nullptr) {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
if (p_data == nullptr) {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
p_data,
allocator, target_mlvalue);
}
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
p_data,
allocator, target_mlvalue);
}
} else {
Tensor::InitOrtValue(source_tensor.DataType(),
source_tensor.Shape(),
allocator, target_mlvalue);
auto type = DataTypeImpl::GetType<Tensor>();

target_mlvalue.Init(nullptr, // This OrtValue is "None" and has no data
type,
type->GetDeleteFunc());
}
} else if (source_mlvalue.IsSparseTensor()) {
#if !defined(DISABLE_SPARSE_TENSORS)
@@ -173,7 +181,9 @@ static Status BatchOrCopyMLValue(const SessionState& session_state,
Tensor* p_output_tensor = target_mlvalue.GetMutable<Tensor>();

if (copy_tensor_pairs != nullptr) {
copy_tensor_pairs->push_back({source_tensor, *p_output_tensor, stream});
if (target_mlvalue.IsAllocated()) {
copy_tensor_pairs->push_back({source_tensor, *p_output_tensor, stream});
}
} else {
ORT_RETURN_IF_ERROR(stream ? session_state.GetDataTransferMgr().CopyTensorAsync(source_tensor, *p_output_tensor, *stream) : session_state.GetDataTransferMgr().CopyTensor(source_tensor, *p_output_tensor));
}
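The utils.cc changes make the cross-device copy path aware of "None" values: AllocateHelper only materializes a target tensor when the source OrtValue actually holds data, otherwise it initializes the target as a typed but unallocated OrtValue, and BatchOrCopyMLValue no longer queues a copy for such targets. The standalone sketch below models that control flow with a hypothetical Value type; it is not ORT's OrtValue/Tensor API, just the same branching reduced to its essentials.

#include <cstdio>
#include <utility>
#include <vector>

// Hypothetical stand-in for an OrtValue: an empty optional is represented as a
// value that is typed but carries no allocated data.
struct Value {
  bool allocated = false;
  std::vector<float> data;  // payload, meaningful only when allocated == true

  bool IsAllocated() const { return allocated; }
};

// Mirrors the AllocateHelper branch: allocate a real target only when the
// source carries data; otherwise propagate a "None" value.
void AllocateLike(const Value& source, Value& target) {
  if (source.IsAllocated()) {
    target.allocated = true;
    target.data.resize(source.data.size());  // allocate storage, no copy yet
  } else {
    target.allocated = false;  // target stays "None": typed, but no data
    target.data.clear();
  }
}

// Mirrors the BatchOrCopyMLValue change: a copy is queued only when the target
// was actually allocated.
void CopyOrSkip(const Value& source, Value& target,
                std::vector<std::pair<const Value*, Value*>>& pending_copies) {
  AllocateLike(source, target);
  if (target.IsAllocated()) {
    pending_copies.push_back({&source, &target});
  }  // else: nothing to copy, the empty optional flows through unchanged
}

int main() {
  Value real{true, {1.0f, 2.0f, 3.0f}};
  Value none{};  // empty optional
  Value out_real, out_none;
  std::vector<std::pair<const Value*, Value*>> copies;

  CopyOrSkip(real, out_real, copies);
  CopyOrSkip(none, out_none, copies);

  std::printf("queued copies: %zu, out_none allocated: %d\n",
              copies.size(), static_cast<int>(out_none.IsAllocated()));
  return 0;
}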
124 changes: 66 additions & 58 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -44,67 +44,75 @@ class Memcpy final : public OpKernel {
auto X_type = ctx->InputType(0);
if (X_type->IsTensorType()) {
const auto* X = ctx->Input<Tensor>(0);
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The cudaMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
if (X_type->IsSparseTensorType()) {
// TODO: support async copy for sparse tensor
// sync the stream first, since it is a sync memory copy
cudaStreamSynchronize(static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()));
const auto* X = ctx->Input<SparseTensor>(0);
if (X != nullptr) {
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
SparseTensor* Y = ctx->OutputSparse(0, X->DenseShape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output sparse tensor.");
return X->Copy(Info().GetDataTransferManager(), *Y);
} else if (X_type->IsTensorSequenceType()) {
const TensorSeq* X = ctx->Input<TensorSeq>(0);
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor sequence is nullptr.");
TensorSeq* Y = ctx->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor sequence.");
auto X_dtype = X->DataType();
Y->SetType(X_dtype);
AllocatorPtr alloc;

// If we are copying contents to CUDA, the allocator to use
// to allocate the buffers of the new tensors in the sequence
// can be temp space allocator associated with the CUDA EP
if (Node().OpType() == "MemcpyFromHost") {
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
}
} else {
// If we are copying contents to CPU (op type is "MemcpyToHost"),
// the allocator to use to allocate the buffers of the new tensors
// in the sequence will be the allocator from the CPU EP
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get the CPU allocator.");
}
Tensor* Y = ctx->Output(0, X->Shape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor.");
// do we support async copy?
// The cudaMemCpyAsync will handle the pinned memory and non-pinned memory,
// so we don't need the check here.
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(*X, *Y, *ctx->GetComputeStream()));
return Status::OK();
} else {
// If X is null, it means we have an unallocated tensor as the result of an empty optional
// Hence we create an output with an unallocated tensor instead of copying
ctx->EmptyOptionalTensorOutput(0);
return Status::OK();
}

} else if (X_type->IsSparseTensorType()) {
// TODO: support async copy for sparse tensor
// sync the stream first, since it is a sync memory copy
cudaStreamSynchronize(static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()));
const auto* X = ctx->Input<SparseTensor>(0);
// TODO: support empty optional
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor is nullptr.");
SparseTensor* Y = ctx->OutputSparse(0, X->DenseShape());
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output sparse tensor.");
return X->Copy(Info().GetDataTransferManager(), *Y);
} else if (X_type->IsTensorSequenceType()) {
const TensorSeq* X = ctx->Input<TensorSeq>(0);
// TODO: support empty optional
ORT_ENFORCE(X != nullptr, "Memcpy: Input tensor sequence is nullptr.");
TensorSeq* Y = ctx->Output<TensorSeq>(0);
ORT_ENFORCE(Y != nullptr, "Memcpy: Failed to allocate output tensor sequence.");
auto X_dtype = X->DataType();
Y->SetType(X_dtype);
AllocatorPtr alloc;

// If we are copying contents to CUDA, the allocator to use
// to allocate the buffers of the new tensors in the sequence
// can be temp space allocator associated with the CUDA EP
if (Node().OpType() == "MemcpyFromHost") {
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
}
auto X_size = X->Size();
Y->Reserve(X_size);
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(), source_tensor.Shape(), alloc);
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(source_tensor.Location().device,
target_tensor->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(source_tensor, *target_tensor, *ctx->GetComputeStream()));
Y->Add(std::move(*target_tensor));
} else {
// If we are copying contents to CPU (op type is "MemcpyToHost"),
// the allocator to use to allocate the buffers of the new tensors
// in the sequence will be the allocator from the CPU EP
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get the CPU allocator.");
}
return Status::OK();
}
return Status(common::ONNXRUNTIME, common::FAIL, "Memcpy: Unsupported input type.");
auto X_size = X->Size();
Y->Reserve(X_size);
for (size_t i = 0; i < X_size; ++i) {
const Tensor& source_tensor = X->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(), source_tensor.Shape(), alloc);
auto* gpu_data_transfer = Info().GetDataTransferManager().GetDataTransfer(source_tensor.Location().device,
target_tensor->Location().device);
ORT_RETURN_IF_ERROR(gpu_data_transfer->CopyTensorAsync(source_tensor, *target_tensor, *ctx->GetComputeStream()));
Y->Add(std::move(*target_tensor));
}
return Status::OK();
}
return Status(common::ONNXRUNTIME, common::FAIL, "Memcpy: Unsupported input type.");
}
};

@@ -116,7 +124,7 @@ ONNX_OPERATOR_KERNEL_EX(
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.InputMemoryType(OrtMemTypeCPUInput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypesIRv9()),
.TypeConstraint("T", DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9()),
Memcpy);

ONNX_OPERATOR_KERNEL_EX(
Expand All @@ -126,7 +134,7 @@ ONNX_OPERATOR_KERNEL_EX(
kCudaExecutionProvider,
(*KernelDefBuilder::Create())
.OutputMemoryType(OrtMemTypeCPUOutput, 0)
.TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorAndSequenceTensorTypesIRv9()),
.TypeConstraint("T", DataTypeImpl::AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9()),
Memcpy);

} // namespace cuda
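The registration change at the end of this file is what makes Memcpy nodes legal for optional values in the first place: a kernel is only matched to a node if the node's input type appears in the kernel's "T" type-constraint list, and the MemcpyFromHost/MemcpyToHost constraints previously listed only plain tensors and tensor sequences. The sketch below models that matching rule with a hypothetical registry entry; the real ORT kernel registry is considerably richer.

#include <iostream>
#include <set>
#include <string>

// Hypothetical model of a kernel type constraint: the kernel is eligible for a
// node only if the node's input type is in the allowed set.
struct KernelDef {
  std::string op;
  std::set<std::string> allowed_T;

  bool Matches(const std::string& op_type, const std::string& input_type) const {
    return op == op_type && allowed_T.count(input_type) > 0;
  }
};

int main() {
  // Before the fix: only plain tensors and tensor sequences were listed.
  KernelDef memcpy_old{"MemcpyFromHost", {"tensor(float)", "seq(tensor(float))"}};
  // After the fix: the optional variants are listed as well.
  KernelDef memcpy_new{"MemcpyFromHost",
                       {"tensor(float)", "seq(tensor(float))",
                        "optional(tensor(float))", "optional(seq(tensor(float)))"}};

  const std::string node_input = "optional(tensor(float))";
  std::cout << std::boolalpha
            << "old kernel matches: " << memcpy_old.Matches("MemcpyFromHost", node_input) << "\n"
            << "new kernel matches: " << memcpy_new.Matches("MemcpyFromHost", node_input) << "\n";
  return 0;
}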
@@ -615,6 +615,10 @@ struct ProviderHost {
virtual const std::vector<MLDataType>& DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv9() = 0;

virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() = 0;

virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypes() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv4() = 0;
virtual const std::vector<MLDataType>& DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv9() = 0;
@@ -780,6 +784,10 @@ struct ProviderHost {
virtual Tensor* OpKernelContext__Output(OpKernelContext* p, int index, const TensorShape& shape) = 0;
#if !defined(DISABLE_SPARSE_TENSORS)
virtual SparseTensor* OpKernelContext__OutputSparse(OpKernelContext* p, int index, const TensorShape& shape) = 0;
#endif
#if !defined(DISABLE_OPTIONAL_TYPE)
virtual void OpKernelContext__EmptyOptionalTensorOutput(OpKernelContext* p, int index) = 0;
virtual void OpKernelContext__EmptyOptionalTensorSeqOutput(OpKernelContext* p, int index) = 0;
#endif
virtual Tensor& OpKernelContext__RequiredOutput(OpKernelContext* p, int index, const TensorShape& shape) = 0;
virtual MLDataType OpKernelContext__InputType(const OpKernelContext* p, int index) = 0;
@@ -577,6 +577,10 @@ class DataTypeImpl final {
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllOptionalAndTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllOptionalAndTensorAndSequenceTensorTypesIRv9(); }

static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypes() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypes(); }
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllFixedSizeOptionalAndTensorAndSequenceTensorTypesIRv9(); }

static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypes() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypes(); }
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv4() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv4(); }
static const std::vector<MLDataType>& AllFixedSizeTensorAndSequenceTensorTypesIRv9() { return g_host->DataTypeImpl__AllFixedSizeTensorAndSequenceTensorTypesIRv9(); }
@@ -813,6 +817,10 @@ struct OpKernelContext final {
Tensor* Output(int index, const TensorShape& shape) { return g_host->OpKernelContext__Output(this, index, shape); }
#if !defined(DISABLE_SPARSE_TENSORS)
SparseTensor* OutputSparse(int index, const TensorShape& shape) { return g_host->OpKernelContext__OutputSparse(this, index, shape); }
#endif
#if !defined(DISABLE_OPTIONAL_TYPE)
void EmptyOptionalTensorOutput(int index) { return g_host->OpKernelContext__EmptyOptionalTensorOutput(this, index); }
void EmptyOptionalTensorSeqOutput(int index) { return g_host->OpKernelContext__EmptyOptionalTensorSeqOutput(this, index); }
#endif
int OutputCount() const { return g_host->OpKernelContext__OutputCount(this); }

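The last two hunks live in the provider bridge. Execution providers built as shared libraries, the CUDA EP among them, reach core onnxruntime only through the ProviderHost interface, so every new core facility used here (the AllFixedSizeOptional* type lists and the empty-optional outputs on OpKernelContext) needs a virtual on ProviderHost plus a thin wrapper that forwards through the global g_host pointer. A stripped-down sketch of that pattern follows, with illustrative names rather than the real bridge types.

#include <iostream>
#include <string>
#include <vector>

// Host-side interface: core onnxruntime implements these virtuals and hands the
// shared-library provider a pointer to the implementation.
struct ProviderHostSketch {
  virtual ~ProviderHostSketch() = default;
  virtual const std::vector<std::string>& AllOptionalAndTensorTypes() = 0;
};

// Set once when the provider shared library is initialized.
static ProviderHostSketch* g_host_sketch = nullptr;

// Provider-side wrapper: same API surface as the core class, but every call
// forwards across the shared-library boundary through g_host_sketch.
struct DataTypeListsSketch {
  static const std::vector<std::string>& AllOptionalAndTensorTypes() {
    return g_host_sketch->AllOptionalAndTensorTypes();
  }
};

// A concrete host implementation, standing in for core onnxruntime.
struct CoreHost : ProviderHostSketch {
  const std::vector<std::string>& AllOptionalAndTensorTypes() override {
    static const std::vector<std::string> types{
        "tensor(float)", "optional(tensor(float))"};
    return types;
  }
};

int main() {
  CoreHost core;
  g_host_sketch = &core;  // what the real bridge does during provider registration

  for (const auto& t : DataTypeListsSketch::AllOptionalAndTensorTypes()) {
    std::cout << t << "\n";
  }
  return 0;
}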