From e1c14ebc2dcb7a53f77cccd196c866301275d002 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Zientkiewicz?= <mzient@gmail.com>
Date: Mon, 27 Jan 2025 17:25:59 +0100
Subject: [PATCH] Tensor list.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
---
 dali/c_api_2/data_objects.cc      | 175 +++++++++++++++++++++++
 dali/c_api_2/data_objects.h       | 229 +++++++++++++++++++++++++++---
 dali/c_api_2/data_objects_test.cc |  71 +++++++++
 dali/c_api_2/error_handling.h     |   6 +
 dali/c_api_2/managed_handle.h     | 122 ++++++++++++++++
 dali/c_api_2/ref_counting.h       |  92 ++++++++++++
 include/dali/dali.h               |  16 +++
 7 files changed, 694 insertions(+), 17 deletions(-)
 create mode 100644 dali/c_api_2/data_objects.cc
 create mode 100644 dali/c_api_2/data_objects_test.cc
 create mode 100644 dali/c_api_2/managed_handle.h
diff --git a/dali/c_api_2/data_objects.cc b/dali/c_api_2/data_objects.cc
new file mode 100644
index 0000000000..fda220f4c8
--- /dev/null
+++ b/dali/c_api_2/data_objects.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/c_api_2/data_objects.h"
+#include "dali/c_api_2/error_handling.h"
+
+namespace dali::c_api {
+
+RefCountedPtr<TensorListInterface> TensorListInterface::Create(daliBufferPlacement_t placement) {
+  switch (placement.device_type) {
+    case DALI_STORAGE_CPU:
+    {
+      auto tl = std::make_shared<TensorList<CPUBackend>>();
+      tl->set_pinned(placement.pinned);
+      if (placement.pinned)
+        tl->set_device_id(placement.device_id);
+      return Wrap(std::move(tl));
+    }
+    case DALI_STORAGE_GPU:
+    {
+      auto tl = std::make_shared<TensorList<GPUBackend>>();
+      tl->set_pinned(placement.pinned);
+      tl->set_device_id(placement.device_id);
+      return Wrap(std::move(tl));
+    }
+    default:
+      throw std::invalid_argument(make_string("Invalid storage device: ", placement.device_type));
+  }
+}
+
+TensorListInterface *ToPointer(daliTensorList_h handle) {
+  if (!handle)
+    throw NullHandle("TensorList");
+  return static_cast<TensorListInterface *>(handle);
+}
+
+}  // namespace dali::c_api
+
+using namespace dali::c_api;  // NOLINT
+
+daliResult_t daliTensorListCreate(daliTensorList_h *out, daliBufferPlacement_t placement) {
+  DALI_PROLOG();
+  auto tl = dali::c_api::TensorListInterface::Create(placement);
+  *out = tl.release();  // no throwing allowed after this line!
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListIncRef(daliTensorList_h tl, int *new_ref) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tl);
+  int r = ptr->IncRef();
+  if (new_ref)
+    *new_ref = r;
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListDecRef(daliTensorList_h tl, int *new_ref) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tl);
+  int r = ptr->DecRef();
+  if (new_ref)
+    *new_ref = r;
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListRefCount(daliTensorList_h tl, int *ref) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tl);
+  if (!ref)
+    throw std::invalid_argument("The output pointer must not be NULL.");
+  int r = ptr->RefCount();
+  *ref = r;
+  DALI_EPILOG();
+}
+
+DALI_API daliResult_t daliTensorListAttachBuffer(
+      daliTensorList_h tensor_list,
+      int num_samples,
+      int ndim,
+      daliDataType_t dtype,
+      const char *layout,
+      const int64_t *shapes,
+      void *data,
+      const ptrdiff_t *sample_offsets,
+      daliDeleter_t deleter) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  ptr->AttachBuffer(num_samples, ndim, dtype, layout, shapes, data, sample_offsets, deleter);
+  DALI_EPILOG();
+}
+
+DALI_API daliResult_t daliTensorListAttachSamples(
+      daliTensorList_h tensor_list,
+      int num_samples,
+      int ndim,
+      daliDataType_t dtype,
+      const char *layout,
+      const daliTensorDesc_t *samples,
+      const daliDeleter_t *sample_deleters) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  ptr->AttachSamples(num_samples, ndim, dtype, layout, samples, sample_deleters);
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListResize(
+      daliTensorList_h tensor_list,
+      int num_samples,
+      int ndim,
+      daliDataType_t dtype,
+      const int64_t *shapes) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  ptr->Resize(num_samples, ndim, dtype, shapes);
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListSetLayout(
+      daliTensorList_h tensor_list,
+      const char *layout) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  ptr->SetLayout(layout);
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListGetLayout(
+      daliTensorList_h tensor_list,
+      const char **layout) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  if (!layout)
+    throw std::invalid_argument("The output parameter `layout` must not be be NULL");
+  *layout = ptr->GetLayout();
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListGetStream(
+      daliTensorList_h tensor_list,
+      cudaStream_t *out_stream) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  if (!out_stream)
+    throw std::invalid_argument("The output parameter `out_stream` must not be NULL");
+  auto str = ptr->GetStream();
+  *out_stream = str.has_value() ? *str : cudaStream_t(-1);
+  return str.has_value() ? DALI_SUCCESS : DALI_NO_DATA;
+  DALI_EPILOG();
+}
+
+daliResult_t daliTensorListSetStream(
+      daliTensorList_h tensor_list,
+      const cudaStream_t *stream,
+      daliBool synchronize) {
+  DALI_PROLOG();
+  auto *ptr = ToPointer(tensor_list);
+  std::optional<cudaStream_t> opt_str;
+  if (stream)
+    opt_str = *stream;
+  else
+    opt_str = std::nullopt;
+  ptr->SetStream(opt_str, synchronize);
+  DALI_EPILOG();
+}
diff --git a/dali/c_api_2/data_objects.h b/dali/c_api_2/data_objects.h
index bacbe7167f..583fdbf5b5 100644
--- a/dali/c_api_2/data_objects.h
+++ b/dali/c_api_2/data_objects.h
@@ -15,15 +15,20 @@
 #ifndef DALI_C_API_2_DATA_OBJECTS_H_
 #define DALI_C_API_2_DATA_OBJECTS_H_
 
+#include <cstdint>
+#include <memory>
 #include <optional>
 #include "dali/dali.h"
 #include "dali/pipeline/data/tensor_list.h"
 #include "dali/c_api_2/ref_counting.h"
+#include "dali/core/tensor_shape_print.h"
+
+struct _DALITensorList {};
 
 namespace dali {
 namespace c_api {
 
-class TensorListInterface : public RefCountedObject {
+class TensorListInterface : public _DALITensorList, public RefCountedObject {
  public:
   virtual ~TensorListInterface() = default;
 
@@ -37,6 +42,7 @@ class TensorListInterface : public RefCountedObject {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const int64_t *shapes,
       void *data,
       const ptrdiff_t *sample_offsets,
@@ -46,21 +52,28 @@ class TensorListInterface : public RefCountedObject {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const daliTensorDesc_t *samples,
       const daliDeleter_t *sample_deleters) = 0;
 
   virtual daliBufferPlacement_t GetBufferPlacement() const = 0;
 
+  virtual const char *GetLayout() const = 0;
+
+  virtual void SetLayout(const char *layout) = 0;
+
   virtual void SetStream(std::optional<cudaStream_t> stream, bool synchronize) = 0;
 
   virtual std::optional<cudaStream_t> GetStream() const = 0;
 
-  virtual std::optional<cudaEvent_t> GetReadyEvent() const() = 0;
+  virtual std::optional<cudaEvent_t> GetReadyEvent() const = 0;
 
   virtual cudaEvent_t GetOrCreateReadyEvent() = 0;
+
+  static RefCountedPtr<TensorListInterface> Create(daliBufferPlacement_t placement);
 };
 
-struct TensorListDeleter {
+struct BufferDeleter {
   daliDeleter_t deleter;
   AccessOrder deletion_order;
 
@@ -71,7 +84,7 @@ struct TensorListDeleter {
                             deletion_order.is_device() ? &stream : nullptr);
     }
     if (deleter.destroy_context) {
-      deleter.destroy_context(deleter.destroy_context);
+      deleter.destroy_context(deleter.deleter_ctx);
     }
   }
 };
@@ -86,38 +99,90 @@ class TensorListWrapper : public TensorListInterface {
       int ndim,
       daliDataType_t dtype,
       const int64_t *shapes) override {
-    tl_->Resize(TensorListShape<>(make_cspan(shapes, num_samples*ndim), num_samples, ndim), dtype);
+    std::vector<int64_t> shape_data(shapes, shapes + ndim * num_samples);
+    tl_->Resize(TensorListShape<>(shape_data, num_samples, ndim), dtype);
   }
 
   void AttachBuffer(
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const int64_t *shapes,
       void *data,
       const ptrdiff_t *sample_offsets,
       daliDeleter_t deleter) override {
+
+    if (num_samples < 0)
+      throw std::invalid_argument("The number of samples must not be negative.");
+    if (ndim < 0)
+      throw std::invalid_argument("The number of dimensions must not be negative.");
+    if (!shapes && ndim >= 0)
+      throw std::invalid_argument("The `shapes` are required for non-scalar (ndim>=0) samples.");
+    if (!data && num_samples > 0) {
+      for (int i = 0; i < num_samples; i++) {
+        auto sample_shape = make_cspan(&shapes[i*ndim], ndim);
+        for (int j = 0; j < ndim; j++)
+          if (sample_shape[j] < 0)
+            throw std::invalid_argument(make_string(
+              "Negative extent encountered in the shape of sample ", i, ". Offending shape: ",
+              TensorShape<-1>(sample_shape)));
+        if (volume(sample_shape) > 0)
+          throw std::invalid_argument(
+            "The pointer to the data buffer must not be null for a non-empty tensor list.");
+        if (sample_offsets && sample_offsets[i])
+          throw std::invalid_argument(
+            "All sample_offsets must be zero when the data pointer is NULL.");
+      }
+    }
+
+    TensorLayout new_layout = {};
+
+    if (!layout) {
+      if (ndim == tl_->sample_dim())
+        new_layout = tl_->GetLayout();
+    } else {
+      new_layout = layout;
+      if (new_layout.ndim() != ndim)
+        throw std::invalid_argument(make_string(
+          "The layout '", new_layout, "' cannot describe ", ndim, "-dimensional data."));
+    }
+
     tl_->Reset();
     tl_->SetSize(num_samples);
     tl_->set_sample_dim(ndim);
-    ptridff_t next_offset = 0;
+    tl_->SetLayout(new_layout);
+    ptrdiff_t next_offset = 0;
     auto type_info = TypeTable::GetTypeInfo(dtype);
     auto element_size = type_info.size();
-    std::shared_ptr<void *> buffer;
+
+    std::shared_ptr<void> buffer;
     if (!deleter.delete_buffer && !deleter.destroy_context) {
-      buffer.reset(buffer, [](void *){});
+      buffer = std::shared_ptr<void>(data, [](void *){});
     } else {
-      buffer.reset(buffer, TensorListDeleter{deleter, order()});
+      buffer = std::shared_ptr<void>(data, BufferDeleter{deleter, tl_->order()});
     }
+
     for (int i = 0; i < num_samples; i++) {
-      TensorShape<> sample_shape(make_cspan(&shapes[i*ndim]. ndim));
+      TensorShape<> sample_shape(make_cspan(&shapes[i*ndim], ndim));
       void *sample_data;
+      size_t sample_bytes = volume(sample_shape) * element_size;
       if (sample_offsets) {
         sample_data = static_cast<char *>(data) + sample_offsets[i];
       } else {
         sample_data = static_cast<char *>(data) + next_offset;
-        next_offset += volme(sample_shape) * element_size;
+        next_offset += sample_bytes;
       }
+      tl_->SetSample(
+        i,
+        std::shared_ptr<void>(buffer, sample_data),
+        sample_bytes,
+        tl_->is_pinned(),
+        sample_shape,
+        dtype,
+        tl_->device_id(),
+        tl_->order(),
+        new_layout);
     }
   }
 
@@ -125,24 +190,154 @@ class TensorListWrapper : public TensorListInterface {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const daliTensorDesc_t *samples,
       const daliDeleter_t *sample_deleters) {
+    if (num_samples < 0)
+      throw std::invalid_argument("The number of samples must not be negative.");
+    if (num_samples > 0 && !samples)
+      throw std::invalid_argument("The pointer to sample descriptors must not be NULL.");
+    if (ndim < 0) {
+      if (num_samples == 0)
+        throw std::invalid_argument(
+          "The number of dimensions must not be negative when num_samples is 0.");
+      else
+        ndim = samples[0].ndim;
+    }
+
+    for (int i = 0; i < num_samples; i++) {
+      if (samples[i].ndim != ndim)
+        throw std::invalid_argument(make_string(
+            "Invalid `ndim` at sample ", i, ": got ", samples[i].ndim, ", expected ", ndim, "."));
+      if (ndim && !samples[i].shape)
+        throw std::invalid_argument(make_string("Got NULL shape in sample ", i, "."));
+
+      for (int j = 0; j < ndim; j++)
+        if (samples[i].shape[j] < 0) {
+          TensorShape<> sample_shape(make_cspan(samples[i].shape, samples[i].ndim));
+          throw std::invalid_argument(make_string(
+            "Negative extent encountered in the shape of sample ", i, ". Offending shape: ",
+            sample_shape));
+        }
+
+      if (!samples[i].data && volume(make_cspan(samples[i].shape, ndim)))
+        throw std::invalid_argument(make_string(
+            "Got NULL data pointer in a non-empty sample ", i, "."));
+    }
+
+    TensorLayout new_layout = {};
+
+    if (!layout) {
+      if (ndim == tl_->sample_dim())
+        new_layout = tl_->GetLayout();
+    } else {
+      new_layout = layout;
+      if (new_layout.ndim() != ndim)
+        throw std::invalid_argument(make_string(
+          "The layout '", new_layout, "' cannot describe ", ndim, "-dimensional data."));
+    }
+
+    tl_->Reset();
+    tl_->SetSize(num_samples);
+    tl_->set_sample_dim(ndim);
+    tl_->SetLayout(new_layout);
+
+    auto deletion_order = tl_->order();
+
+    auto type_info = TypeTable::GetTypeInfo(dtype);
+    auto element_size = type_info.size();
+    for (int i = 0; i < num_samples; i++) {
+      TensorShape<> sample_shape(make_cspan(samples[i].shape, samples[i].ndim));
+      size_t sample_bytes = volume(sample_shape) * element_size;
+      std::shared_ptr<void> sample_ptr;
+      if (sample_deleters) {
+        sample_ptr = std::shared_ptr<void>(
+          samples[i].data,
+          BufferDeleter{sample_deleters[i], deletion_order});
+      } else {
+        sample_ptr = std::shared_ptr<void>(samples[i].data, [](void*) {});
+      }
 
+      tl_->SetSample(
+        i,
+        sample_ptr,
+        sample_bytes,
+        tl_->is_pinned(),
+        sample_shape,
+        dtype,
+        tl_->device_id(),
+        tl_->order(),
+        new_layout);
+    }
   }
 
-  virtual daliBufferPlacement_t GetBufferPlacement() const = 0;
+  daliBufferPlacement_t GetBufferPlacement() const override {
+    daliBufferPlacement_t placement;
+    placement.device_id = tl_->device_id();
+    StorageDevice dev = backend_to_storage_device<Backend>::value;
+    placement.device_type = static_cast<daliStorageDevice_t>(dev);
+    placement.pinned = tl_->is_pinned();
+    return placement;
+  }
 
-  virtual void SetStream(std::optional<cudaStream_t> stream, bool synchronize) = 0;
+  void SetStream(std::optional<cudaStream_t> stream, bool synchronize) override {
+    tl_->set_order(stream.has_value() ? AccessOrder(*stream) : AccessOrder::host(), synchronize);
+  }
 
-  virtual std::optional<cudaStream_t> GetStream() const = 0;
+  void SetLayout(const char *layout_string) {
+    if (layout_string) {
+      TensorLayout layout(layout_string);
+      if (layout.ndim() != tl_->sample_dim())
+        throw std::invalid_argument(make_string(
+          "The layout '", layout, "' cannot describe ", tl_->sample_dim(), "-dimensional data."));
+      tl_->SetLayout(layout);
+    } else {
+      tl_->SetLayout("");
+    }
+  }
 
-  virtual std::optional<cudaEvent_t> GetReadyEvent() const() = 0;
+  const char *GetLayout() const override {
+    auto &layout = tl_->GetLayout();
+    return !layout.empty() ? layout.data() : nullptr;
+  }
 
-  virtual cudaEvent_t GetOrCreateReadyEvent() = 0;
+  std::optional<cudaStream_t> GetStream() const override {
+    auto o = tl_->order();
+    if (o.is_device())
+      return o.stream();
+    else
+      return std::nullopt;
+  }
+
+  std::optional<cudaEvent_t> GetReadyEvent() const override {
+    auto &e = tl_->ready_event();
+    if (e)
+      return e.get();
+    else
+      return std::nullopt;
+  }
+
+  cudaEvent_t GetOrCreateReadyEvent() override {
+    auto &e = tl_->ready_event();
+    if (e)
+      return e.get();
+    int device_id = tl_->device_id();
+    if (device_id < 0)
+      throw std::runtime_error("The tensor list is not associated with a CUDA device.");
+    tl_->set_ready_event(CUDASharedEvent::Create(device_id));
+    return tl_->ready_event().get();
+  }
  private:
-  std::shared_ptr<TensorList<Backend>> impl_;
+  std::shared_ptr<TensorList<Backend>> tl_;
 };
 
+template <typename Backend>
+RefCountedPtr<TensorListWrapper<Backend>> Wrap(std::shared_ptr<TensorList<Backend>> tl) {
+  return RefCountedPtr<TensorListWrapper<Backend>>(new TensorListWrapper<Backend>(std::move(tl)));
+}
+
+TensorListInterface *ToPointer(daliTensorList_h handle);
+
 }  // namespace c_api
 }  // namespace dali
 
diff --git a/dali/c_api_2/data_objects_test.cc b/dali/c_api_2/data_objects_test.cc
new file mode 100644
index 0000000000..053fbf16e0
--- /dev/null
+++ b/dali/c_api_2/data_objects_test.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/c_api_2/data_objects.h"
+#include <gtest/gtest.h>
+#include "dali/c_api_2/managed_handle.h"
+
+TEST(CAPI2_TensorListTest, NullHandle) {
+  daliTensorList_h h = nullptr;
+  int ref = 0;
+  EXPECT_EQ(DALI_ERROR_INVALID_HANDLE, daliTensorListIncRef(h, &ref));
+  EXPECT_EQ(DALI_ERROR_INVALID_HANDLE, daliTensorListDecRef(h, &ref));
+  EXPECT_EQ(DALI_ERROR_INVALID_HANDLE, daliTensorListRefCount(h, &ref));
+}
+
+TEST(CAPI2_TensorListTest, CreateDestroy) {
+  daliBufferPlacement_t placement{};
+  placement.device_type = DALI_STORAGE_CPU;
+  placement.pinned = false;
+  daliTensorList_h h = nullptr;
+  daliResult_t r = daliTensorListCreate(&h, placement);
+  ASSERT_NE(h, nullptr);
+  dali::c_api::TensorListHandle tl(h);
+  ASSERT_EQ(h, tl.get());
+  ASSERT_EQ(r, DALI_SUCCESS);
+
+  int ref = -1;
+  EXPECT_EQ(daliTensorListRefCount(h, &ref), DALI_SUCCESS);
+  EXPECT_EQ(ref, 1);
+  ref = -1;
+
+  h = tl.release();
+  EXPECT_EQ(daliTensorListDecRef(h, &ref), DALI_SUCCESS);
+  EXPECT_EQ(ref, 0);
+}
+
+inline auto CreateTensorList(daliBufferPlacement_t placement) {
+  auto tl = dali::c_api::TensorListInterface::Create(placement);
+  return dali::c_api::TensorListHandle(tl.release());
+}
+
+TEST(CAPI2_TensorListTest, Resize) {
+  daliBufferPlacement_t placement{};
+  placement.device_type = DALI_STORAGE_GPU;
+  auto tl = CreateTensorList(placement);
+  int64_t shapes[] = {
+    480, 640, 3,
+    600, 800, 3,
+    348, 720, 1,  //
+  };
+  EXPECT_EQ(daliTensorListResize(tl, 4, 3, DALI_UINT32, nullptr), DALI_ERROR_INVALID_ARGUMENT);
+  EXPECT_EQ(daliTensorListResize(tl, -1, 3, DALI_UINT32, shapes), DALI_ERROR_INVALID_ARGUMENT);
+  EXPECT_EQ(daliTensorListResize(tl, 4, -1, DALI_UINT32, shapes), DALI_ERROR_INVALID_ARGUMENT);
+  shapes[0] = -1;
+  EXPECT_EQ(daliTensorListResize(tl, 4, 3, DALI_UINT32, shapes), DALI_ERROR_INVALID_ARGUMENT);
+  shapes[0] = 480;
+  EXPECT_EQ(daliTensorListResize(tl, 4, 3, DALI_UINT32, shapes), DALI_SUCCESS);
+
+  
+}
diff --git a/dali/c_api_2/error_handling.h b/dali/c_api_2/error_handling.h
index 30482e0aed..9101a448dd 100644
--- a/dali/c_api_2/error_handling.h
+++ b/dali/c_api_2/error_handling.h
@@ -32,6 +32,12 @@ class InvalidHandle : public std::invalid_argument {
   InvalidHandle(const char *what) : std::invalid_argument(what) {}
 };
 
+inline InvalidHandle NullHandle() { return InvalidHandle("The handle must not be NULL."); }
+
+inline InvalidHandle NullHandle(const char *what_handle) {
+  return InvalidHandle(make_string("The ", what_handle, " handle must not be NULL."));
+}
+
 }  // namespace c_api
 }  // namespace dali
 
diff --git a/dali/c_api_2/managed_handle.h b/dali/c_api_2/managed_handle.h
new file mode 100644
index 0000000000..b230df8f68
--- /dev/null
+++ b/dali/c_api_2/managed_handle.h
@@ -0,0 +1,122 @@
+    // Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef DALI_C_API_2_MANAGED_HANDLE_H_
+#define DALI_C_API_2_MANAGED_HANDLE_H_
+
+#include "dali/dali.h"
+#include "dali/core/unique_handle.h"
+
+namespace dali::c_api {
+
+template <typename HandleType, typename Actual>
+class RefCountedHandle {
+ public:
+  using handle_type = HandleType;
+  static constexpr handle_type null_handle() { return 0; }
+
+  constexpr RefCountedHandle() : handle_(Actual::null_handle()) {}
+  constexpr explicit RefCountedHandle(handle_type h) : handle_(h) {}
+  ~RefCountedHandle() { reset(); }
+
+  RefCountedHandle(const RefCountedHandle &h) {
+    handle_ = h.handle_;
+    if (*this)
+        Actual::IncRef(handle_);
+  }
+
+  RefCountedHandle(RefCountedHandle &&h) noexcept {
+    handle_ = h.handle_;
+    h.handle_ = Actual::null_handle();
+  }
+
+  RefCountedHandle &operator=(const RefCountedHandle &other) {
+    if (other.handle_) {
+        Actual::IncRef(other.handle_);
+    }
+    reset();
+    handle_ = other.handle_;
+    return *this;
+  }
+
+  RefCountedHandle &operator=(RefCountedHandle &&other) noexcept {
+    std::swap(handle_, other.handle_);
+    other.reset();
+    return *this;
+  }
+
+  void reset() noexcept {
+    if (*this)
+        Actual::DecRef(handle_);
+    handle_ = Actual::null_handle();
+  }
+
+  [[nodiscard]] handle_type release() noexcept {
+    auto h = handle_;
+    handle_ = Actual::null_handle();
+    return h;
+  }
+
+  handle_type get() const noexcept { return handle_; }
+  operator handle_type() const noexcept { return get(); }
+
+  explicit operator bool() const noexcept { return handle_ != Actual::null_handle(); }
+
+ private:
+  handle_type handle_;
+};
+
+#define DALI_C_UNIQUE_HANDLE(Resource) \
+class Resource##Handle : public dali::UniqueHandle<dali##Resource##_h, Resource##Handle> { \
+ public: \
+  using UniqueHandle<dali##Resource##_h, Resource##Handle>::UniqueHandle; \
+  static void DestroyHandle(dali##Resource##_h h) { \
+    auto result = dali##Resource##Destroy(h); \
+    if (result != DALI_SUCCESS) { \
+        throw std::runtime_error(daliGetLastErrorMessage()); \
+    } \
+  } \
+}
+
+#define DALI_C_REF_HANDLE(Resource) \
+class Resource##Handle \
+: public dali::c_api::RefCountedHandle<dali##Resource##_h, Resource##Handle> { \
+ public: \
+  using RefCountedHandle<dali##Resource##_h, Resource##Handle>::RefCountedHandle; \
+  static int IncRef(dali##Resource##_h h) { \
+    int ref = 0; \
+    auto result = dali##Resource##IncRef(h, &ref); \
+    if (result != DALI_SUCCESS) { \
+        throw std::runtime_error(daliGetLastErrorMessage()); \
+    } \
+    return ref; \
+  } \
+  static int DecRef(dali##Resource##_h h) { \
+    int ref = 0; \
+    auto result = dali##Resource##DecRef(h, &ref); \
+    if (result != DALI_SUCCESS) { \
+        throw std::runtime_error(daliGetLastErrorMessage()); \
+    } \
+    return ref; \
+  } \
+}
+
+DALI_C_UNIQUE_HANDLE(Pipeline);
+DALI_C_UNIQUE_HANDLE(PipelineOutputs);
+DALI_C_REF_HANDLE(TensorList);
+
+
+}  // namespace dali::c_api
+
+#endif  // DALI_C_API_2_MANAGED_HANDLE_H_
diff --git a/dali/c_api_2/ref_counting.h b/dali/c_api_2/ref_counting.h
index 611f4bf8e0..11df801388 100644
--- a/dali/c_api_2/ref_counting.h
+++ b/dali/c_api_2/ref_counting.h
@@ -15,10 +15,102 @@
 #ifndef DALI_C_API_2_REF_COUNTING_H_
 #define DALI_C_API_2_REF_COUNTING_H_
 
+#include <atomic>
+#include <type_traits>
+#include <utility>
+
 namespace dali::c_api {
 
 class RefCountedObject {
  public:
+  int IncRef() noexcept {
+    return std::atomic_fetch_add_explicit(&ref_, 1, std::memory_order_relaxed) + 1;
+  }
+
+  int DecRef() noexcept {
+    int ret = std::atomic_fetch_sub_explicit(&ref_, 1, std::memory_order_acq_rel) - 1;
+    if (!ret)
+        delete this;
+    return ret;
+  }
+
+  int RefCount() const noexcept {
+    return ref_.load(std::memory_order_relaxed);
+  }
+
+  virtual ~RefCountedObject() = default;
+ private:
+  std::atomic<int> ref_{1};
+};
+
+template <typename T>
+class RefCountedPtr {
+ public:
+  constexpr RefCountedPtr() noexcept = default;
+
+  explicit RefCountedPtr(T *ptr, bool inc_ref = false) noexcept : ptr_(ptr) {
+    if (inc_ref && ptr_)
+      ptr_->IncRef();
+  }
+
+  ~RefCountedPtr() {
+    reset();
+  }
+
+  template <typename U, std::enable_if_t<std::is_convertible_v<U *, T *>, int> = 0>
+  RefCountedPtr(const RefCountedPtr<U> &other) noexcept : ptr_(other.ptr_) {
+    if (ptr_)
+      ptr_->IncRef();
+  }
+
+  template <typename U, std::enable_if_t<std::is_convertible_v<U *, T *>, int> = 0>
+  RefCountedPtr(RefCountedPtr<U> &&other) noexcept : ptr_(other.ptr_) {
+    other.ptr_ = nullptr;
+  }
+
+  template <typename U>
+  std::enable_if_t<std::is_convertible_v<U *, T *>, RefCountedPtr> &
+  operator=(const RefCountedPtr<U> &other) noexcept {
+    if (ptr_ == other.ptr_)
+      return *this;
+    if (other.ptr_)
+      other.ptr_->IncRef();
+    ptr_->DecRef();
+    ptr_ = other.ptr_;
+    return *this;
+  }
+
+  template <typename U>
+  std::enable_if_t<std::is_convertible_v<U *, T *>, RefCountedPtr> &
+  operator=(RefCountedPtr &&other) noexcept {
+    if (&other == this)
+      return *this;
+    std::swap(ptr_, other.ptr_);
+    other.reset();
+  }
+
+  void reset() noexcept {
+    if (ptr_)
+      ptr_->DecRef();
+    ptr_= nullptr;
+  }
+
+  [[nodiscard]] T *release() noexcept {
+    T *p = ptr_;
+    ptr_ = nullptr;
+    return p;
+  }
+
+  constexpr T *operator->() const & noexcept { return ptr_; }
+
+  constexpr T &operator*() const & noexcept { return *ptr_; }
+
+  constexpr T *get() const & noexcept { return ptr_; }
+
+ private:
+  template <typename U>
+  friend class RefCountedPtr;
+  T *ptr_ = nullptr;
 };
 
 }  // namespace dali::c_api
diff --git a/include/dali/dali.h b/include/dali/dali.h
index 0d62e26beb..5b2877943b 100644
--- a/include/dali/dali.h
+++ b/include/dali/dali.h
@@ -281,6 +281,9 @@ DALI_API daliResult_t daliPipelineCreate(
   daliPipeline_h *out_pipe_handle,
   const daliPipelineParams_t *params);
 
+/** Destroys a DALI pipeline. */
+DALI_API daliResult_t daliPipelineDestroy(daliPipeline_h pipeline);
+
 /** Creates a DALI pipeline from a serialized one.
  *
  * This function creates and deserializes a pipeline. The parameters are used to override
@@ -555,6 +558,8 @@ typedef struct _DALIBufferPlacement {
   daliStorageDevice_t device_type;
 
   /** CUDA device ordinal, as returned by CUDA runtime API.
+   *
+   * The value of this field is meaningful only if `device_type` is GPU or `pinned` is `true`.
    *
    * WARNING: The device_id returned by NVML (and thus, nvidia-smi) may be different.
    */
@@ -596,6 +601,10 @@ DALI_API daliResult_t daliTensorListResize(
  * @param num_samples     the number of samples in the list
  * @param ndim            the number of dimensions in the sample
  * @param dtype           the element type
+ * @param layout          a layout string describing the order of axes in each sample (e.g. HWC),
+ *                        if NULL, and the TensorList's number of dimensions is equal to `ndim,
+ *                        then the current layout is kept;
+ *                        if `layout` is an empty string, the tensor list's layout is cleared
  * @param shapes          the concatenated shapes of the samples;
  *                        must contain num_samples*ndim extents
  * @param data            the pointer to the data buffer
@@ -608,6 +617,7 @@ DALI_API daliResult_t daliTensorListAttachBuffer(
   int num_samples,
   int ndim,
   daliDataType_t dtype,
+  const char *layout,
   const int64_t *shapes,
   void *data,
   const ptrdiff_t *sample_offsets,
@@ -629,6 +639,10 @@ DALI_API daliResult_t daliTensorListAttachBuffer(
  * @param dtype           the type of the element of the tensor;
  *                        if dtype is DALI_NO_TYPE, then the type is taken from samples[0].dtype;
  *                        if set, the dtype in the samples can be left at -1
+ * @param layout          a layout string describing the order of axes in each sample (e.g. HWC),
+ *                        if NULL, and the TensorList's number of dimensions is equal to `ndim,
+ *                        then the current layout is kept;
+ *                        if `layout` is an empty string, the tensor list's layout is cleared
  * @param samples         the descriptors of the tensors to be attached to the TensorList;
  *                        the `ndim` and `dtype` of the samples must match and they must match the
  *                        values of `ndim` and `dtype` parameters.
@@ -642,6 +656,7 @@ DALI_API daliResult_t daliTensorListAttachSamples(
   int num_samples,
   int ndim,
   daliDataType_t dtype,
+  const char *layout,
   const daliTensorDesc_t *samples,
   const daliDeleter_t *sample_deleters);
 
@@ -699,6 +714,7 @@ DALI_API daliResult_t daliTensorListGetReadyEvent(
  *
  * The function ensures that a readiness event is associated with the tensor list.
  * It can also get the event handle, if the output parameter pointer is not NULL.
+ * The function fails if the tensor list is not associated with a CUDA device.
  */
 DALI_API daliResult_t daliTensorListGetOrCreateReadyEvent(
   daliTensorList_h tensor_list,