diff --git a/dali/c_api_2/data_objects.h b/dali/c_api_2/data_objects.h
index bacbe7167f..50eb2c6861 100644
--- a/dali/c_api_2/data_objects.h
+++ b/dali/c_api_2/data_objects.h
@@ -15,15 +15,19 @@
 #ifndef DALI_C_API_2_DATA_OBJECTS_H_
 #define DALI_C_API_2_DATA_OBJECTS_H_
 
+#include <stdexcept>
+#include <vector>
 #include <memory>
 #include "dali/dali.h"
 #include "dali/pipeline/data/tensor_list.h"
 #include "dali/c_api_2/ref_counting.h"
 
+struct _DALITensorList {};
+
 namespace dali {
 namespace c_api {
 
-class TensorListInterface : public RefCountedObject {
+class TensorListInterface : public _DALITensorList, public RefCountedObject {
  public:
   virtual ~TensorListInterface() = default;
 
@@ -37,6 +41,7 @@ class TensorListInterface : public RefCountedObject {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const int64_t *shapes,
       void *data,
       const ptrdiff_t *sample_offsets,
@@ -46,6 +51,7 @@ class TensorListInterface : public RefCountedObject {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const daliTensorDesc_t *samples,
       const daliDeleter_t *sample_deleters) = 0;
 
@@ -55,12 +61,14 @@ class TensorListInterface : public RefCountedObject {
 
   virtual std::optional<cudaStream_t> GetStream() const = 0;
 
-  virtual std::optional<cudaEvent_t> GetReadyEvent() const() = 0;
+  virtual std::optional<cudaEvent_t> GetReadyEvent() const = 0;
 
   virtual cudaEvent_t GetOrCreateReadyEvent() = 0;
+
+  static RefCountedPtr<TensorListInterface> Create(daliBufferPlacement_t placement);
 };
 
-struct TensorListDeleter {
+struct BufferDeleter {
   daliDeleter_t deleter;
   AccessOrder deletion_order;
 
@@ -71,7 +79,7 @@ struct TensorListDeleter {
           deletion_order.is_device() ? &stream : nullptr);
     }
     if (deleter.destroy_context) {
-      deleter.destroy_context(deleter.destroy_context);
+      deleter.destroy_context(deleter.deleter_ctx);
     }
   }
 };
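Review note (not part of the patch): `BufferDeleter` above invokes the user-supplied `daliDeleter_t` as `delete_buffer(deleter_ctx, data, stream*)` followed by `destroy_context(deleter_ctx)`. Below is a minimal sketch of a user-side deleter matching that protocol; the exact function-pointer typedefs are my assumption, inferred from those two calls:

    // Hypothetical user-side deleter; signatures inferred from BufferDeleter above.
    #include <cstdlib>
    #include <cuda_runtime_api.h>  // cudaStream_t
    #include "dali/dali.h"         // daliDeleter_t

    static void DeleteHostBuffer(void *ctx, void *data, const cudaStream_t *stream) {
      (void)ctx;
      (void)stream;  // plain host memory: no stream ordering needed
      std::free(data);
    }

    static daliDeleter_t MakeHostDeleter() {
      daliDeleter_t d{};
      d.deleter_ctx = nullptr;           // no per-buffer state
      d.delete_buffer = DeleteHostBuffer;
      d.destroy_context = nullptr;       // nothing to destroy
      return d;
    }

When both `delete_buffer` and `destroy_context` are null, `AttachBuffer` below treats the buffer as externally owned and attaches it with a no-op deleter.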
@@ -86,38 +94,85 @@ class TensorListWrapper : public TensorListInterface {
       int ndim,
       daliDataType_t dtype,
       const int64_t *shapes) override {
-    tl_->Resize(TensorListShape<>(make_cspan(shapes, num_samples*ndim), num_samples, ndim), dtype);
+    std::vector<int64_t> shape_data(shapes, shapes + ndim * num_samples);
+    tl_->Resize(TensorListShape<>(shape_data, num_samples, ndim), dtype);
   }
 
   void AttachBuffer(
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const int64_t *shapes,
       void *data,
       const ptrdiff_t *sample_offsets,
       daliDeleter_t deleter) override {
+
+    if (num_samples < 0)
+      throw std::invalid_argument("The number of samples must not be negative.");
+    if (ndim < 0)
+      throw std::invalid_argument("The number of dimensions must not be negative.");
+    if (!shapes && ndim > 0)
+      throw std::invalid_argument("The `shapes` are required for non-scalar (ndim > 0) samples.");
+    if (!data && num_samples > 0) {
+      for (int i = 0; i < num_samples; i++) {
+        auto sample_shape = make_cspan(&shapes[i*ndim], ndim);
+        if (volume(sample_shape) > 0)
+          throw std::invalid_argument(
+              "The pointer to the data buffer must not be null for a non-empty tensor list.");
+        if (sample_offsets && sample_offsets[i])
+          throw std::invalid_argument(
+              "All sample_offsets must be zero when the data pointer is NULL.");
+      }
+    }
+
+    TensorLayout new_layout = {};
+
+    if (!layout) {
+      if (ndim == tl_->sample_dim())
+        new_layout = tl_->GetLayout();
+    } else {
+      new_layout = layout;
+      if (new_layout.ndim() != ndim)
+        throw std::invalid_argument(make_string(
+            "The layout '", new_layout, "' cannot describe ", ndim, "-dimensional data."));
+    }
+
     tl_->Reset();
     tl_->SetSize(num_samples);
     tl_->set_sample_dim(ndim);
-    ptridff_t next_offset = 0;
+    tl_->SetLayout(new_layout);
+    ptrdiff_t next_offset = 0;
     auto type_info = TypeTable::GetTypeInfo(dtype);
     auto element_size = type_info.size();
-    std::shared_ptr<void> buffer;
+
+    std::shared_ptr<void> buffer;
     if (!deleter.delete_buffer && !deleter.destroy_context) {
-      buffer.reset(buffer, [](void *){});
+      buffer = std::shared_ptr<void>(data, [](void *){});
     } else {
-      buffer.reset(buffer, TensorListDeleter{deleter, order()});
+      buffer = std::shared_ptr<void>(data, BufferDeleter{deleter, tl_->order()});
     }
+
     for (int i = 0; i < num_samples; i++) {
-      TensorShape<> sample_shape(make_cspan(&shapes[i*ndim]. ndim));
+      TensorShape<> sample_shape(make_cspan(&shapes[i*ndim], ndim));
       void *sample_data;
+      size_t sample_bytes = volume(sample_shape) * element_size;
       if (sample_offsets) {
         sample_data = static_cast<char *>(data) + sample_offsets[i];
       } else {
         sample_data = static_cast<char *>(data) + next_offset;
-        next_offset += volme(sample_shape) * element_size;
+        next_offset += sample_bytes;
       }
+      tl_->SetSample(
+          i,
+          std::shared_ptr<void>(buffer, sample_data),
+          sample_bytes,
+          tl_->is_pinned(),
+          sample_shape,
+          dtype,
+          tl_->device_id(),
+          tl_->order(),
+          new_layout);
     }
   }
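Review note (not part of the patch): with the new `layout` parameter, a C-API call that attaches one contiguous buffer could look as follows. This is a sketch; `tl`, `data` and `deleter` are assumed to come from elsewhere, and `DALI_UINT8` is assumed to be a valid `daliDataType_t` value:

    // Sketch: attaching one contiguous buffer holding two packed HWC uint8 images.
    void AttachTwoImages(daliTensorList_h tl, void *data, daliDeleter_t deleter) {
      int64_t shapes[2 * 3] = {480, 640, 3,    // sample 0 (HWC)
                               720, 1280, 3};  // sample 1 (HWC)
      daliResult_t res = daliTensorListAttachBuffer(
          tl, /*num_samples=*/2, /*ndim=*/3, DALI_UINT8,
          /*layout=*/"HWC",            // NULL keeps the layout if ndim matches; "" clears it
          shapes, data,
          /*sample_offsets=*/nullptr,  // NULL: offsets are computed from the packed shapes
          deleter);
      (void)res;  // error handling omitted in this sketch
    }

Passing `sample_offsets == NULL` takes the dense-packing path in `AttachBuffer` above, where `next_offset` is accumulated from each sample's byte size.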
@@ -125,24 +180,127 @@ class TensorListWrapper : public TensorListInterface {
       int num_samples,
       int ndim,
       daliDataType_t dtype,
+      const char *layout,
       const daliTensorDesc_t *samples,
       const daliDeleter_t *sample_deleters) {
+    if (num_samples < 0)
+      throw std::invalid_argument("The number of samples must not be negative.");
+    if (num_samples > 0 && !samples)
+      throw std::invalid_argument("The pointer to sample descriptors must not be NULL.");
+    if (ndim < 0) {
+      if (num_samples == 0)
+        throw std::invalid_argument(
+            "The number of dimensions must not be negative when num_samples is 0.");
+      else
+        ndim = samples[0].ndim;
+    }
+
+    for (int i = 0; i < num_samples; i++) {
+      if (samples[i].ndim != ndim)
+        throw std::invalid_argument(make_string(
+            "Invalid `ndim` at sample ", i, ": got ", samples[i].ndim, ", expected ", ndim, "."));
+      if (ndim && !samples[i].shape)
+        throw std::invalid_argument(make_string("Got NULL shape in sample ", i, "."));
+      if (!samples[i].data && volume(make_cspan(samples[i].shape, ndim)))
+        throw std::invalid_argument(make_string(
+            "Got NULL data pointer in a non-empty sample ", i, "."));
+    }
+
+    TensorLayout new_layout = {};
+
+    if (!layout) {
+      if (ndim == tl_->sample_dim())
+        new_layout = tl_->GetLayout();
+    } else {
+      new_layout = layout;
+      if (new_layout.ndim() != ndim)
+        throw std::invalid_argument(make_string(
+            "The layout '", new_layout, "' cannot describe ", ndim, "-dimensional data."));
+    }
+
+    tl_->Reset();
+    tl_->SetSize(num_samples);
+    tl_->set_sample_dim(ndim);
+    tl_->SetLayout(new_layout);
+
+    auto deletion_order = tl_->order();
+    auto type_info = TypeTable::GetTypeInfo(dtype);
+    auto element_size = type_info.size();
+    for (int i = 0; i < num_samples; i++) {
+      TensorShape<> sample_shape(make_cspan(samples[i].shape, samples[i].ndim));
+      size_t sample_bytes = volume(sample_shape) * element_size;
+      std::shared_ptr<void> sample_ptr;
+      if (sample_deleters) {
+        sample_ptr = std::shared_ptr<void>(
+            samples[i].data,
+            BufferDeleter{sample_deleters[i], deletion_order});
+      } else {
+        sample_ptr = std::shared_ptr<void>(samples[i].data, [](void*) {});
+      }
+
+      tl_->SetSample(
+          i,
+          sample_ptr,
+          sample_bytes,
+          tl_->is_pinned(),
+          sample_shape,
+          dtype,
+          tl_->device_id(),
+          tl_->order(),
+          new_layout);
+    }
   }
 
-  virtual daliBufferPlacement_t GetBufferPlacement() const = 0;
+  daliBufferPlacement_t GetBufferPlacement() const override {
+    daliBufferPlacement_t placement;
+    placement.device_id = tl_->device_id();
+    StorageDevice dev = backend_to_storage_device<Backend>::value;
+    placement.device_type = static_cast<daliStorageDevice_t>(dev);
+    placement.pinned = tl_->is_pinned();
+    return placement;
+  }
 
-  virtual void SetStream(std::optional<cudaStream_t> stream, bool synchronize) = 0;
+  void SetStream(std::optional<cudaStream_t> stream, bool synchronize) override {
+    tl_->set_order(stream.has_value() ? AccessOrder(*stream) : AccessOrder::host(), synchronize);
+  }
 
-  virtual std::optional<cudaStream_t> GetStream() const = 0;
+  std::optional<cudaStream_t> GetStream() const override {
+    auto o = tl_->order();
+    if (o.is_device())
+      return o.stream();
+    else
+      return std::nullopt;
+  }
 
-  virtual std::optional<cudaEvent_t> GetReadyEvent() const() = 0;
+  std::optional<cudaEvent_t> GetReadyEvent() const override {
+    auto &e = tl_->ready_event();
+    if (e)
+      return e.get();
+    else
+      return std::nullopt;
+  }
 
-  virtual cudaEvent_t GetOrCreateReadyEvent() = 0;
+  cudaEvent_t GetOrCreateReadyEvent() override {
+    auto &e = tl_->ready_event();
+    if (e)
+      return e.get();
+    int device_id = tl_->device_id();
+    if (device_id < 0)
+      throw std::runtime_error("The tensor list is not associated with a CUDA device.");
+    tl_->set_ready_event(CUDASharedEvent::Create(device_id));
+    return tl_->ready_event().get();
+  }
 
  private:
-  std::shared_ptr<TensorList<Backend>> impl_;
+  std::shared_ptr<TensorList<Backend>> tl_;
 };
 
+template <typename Backend>
+RefCountedPtr<TensorListWrapper<Backend>> Wrap(std::shared_ptr<TensorList<Backend>> tl) {
+  return RefCountedPtr<TensorListWrapper<Backend>>(new TensorListWrapper<Backend>(std::move(tl)));
+}
+
+
 }  // namespace c_api
 }  // namespace dali
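Review note (not part of the patch): `Wrap()` is the C++-side entry point for exposing an existing `TensorList` through the handle-based C API. A sketch of the intended usage; that `TensorListWrapper<Backend>` is a class template is my assumption (implied by the use of `Backend` in `GetBufferPlacement`), and `daliTensorList_h` is assumed to be a pointer to `_DALITensorList`:

    auto tl = std::make_shared<dali::TensorList<dali::CPUBackend>>();
    auto wrapper = dali::c_api::Wrap(std::move(tl));  // ref count starts at 1
    // TensorListInterface derives from _DALITensorList, so the raw pointer can
    // cross the C boundary as an opaque handle:
    daliTensorList_h handle = wrapper.release();      // hands the reference to the caller

The empty `_DALITensorList` base presumably exists so that the opaque handle type declared in dali.h and the C++ implementation class are related by inheritance rather than by unchecked casts.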
diff --git a/dali/c_api_2/ref_counting.h b/dali/c_api_2/ref_counting.h
index 611f4bf8e0..11df801388 100644
--- a/dali/c_api_2/ref_counting.h
+++ b/dali/c_api_2/ref_counting.h
@@ -15,10 +15,102 @@
 #ifndef DALI_C_API_2_REF_COUNTING_H_
 #define DALI_C_API_2_REF_COUNTING_H_
 
+#include <atomic>
+#include <type_traits>
+#include <utility>
+
 namespace dali::c_api {
 
 class RefCountedObject {
  public:
+  int IncRef() noexcept {
+    return std::atomic_fetch_add_explicit(&ref_, 1, std::memory_order_relaxed) + 1;
+  }
+
+  int DecRef() noexcept {
+    int ret = std::atomic_fetch_sub_explicit(&ref_, 1, std::memory_order_acq_rel) - 1;
+    if (!ret)
+      delete this;
+    return ret;
+  }
+
+  int RefCount() const noexcept {
+    return ref_.load(std::memory_order_relaxed);
+  }
+
+  virtual ~RefCountedObject() = default;
+
+ private:
+  std::atomic<int> ref_{1};
+};
+
+template <typename T>
+class RefCountedPtr {
+ public:
+  constexpr RefCountedPtr() noexcept = default;
+
+  explicit RefCountedPtr(T *ptr, bool inc_ref = false) noexcept : ptr_(ptr) {
+    if (inc_ref && ptr_)
+      ptr_->IncRef();
+  }
+
+  ~RefCountedPtr() {
+    reset();
+  }
+
+  template <typename U, std::enable_if_t<std::is_convertible_v<U *, T *>, int> = 0>
+  RefCountedPtr(const RefCountedPtr<U> &other) noexcept : ptr_(other.ptr_) {
+    if (ptr_)
+      ptr_->IncRef();
+  }
+
+  template <typename U, std::enable_if_t<std::is_convertible_v<U *, T *>, int> = 0>
+  RefCountedPtr(RefCountedPtr<U> &&other) noexcept : ptr_(other.ptr_) {
+    other.ptr_ = nullptr;
+  }
+
+  template <typename U>
+  std::enable_if_t<std::is_convertible_v<U *, T *>, RefCountedPtr> &
+  operator=(const RefCountedPtr<U> &other) noexcept {
+    if (ptr_ == other.ptr_)
+      return *this;
+    if (other.ptr_)
+      other.ptr_->IncRef();
+    if (ptr_)
+      ptr_->DecRef();
+    ptr_ = other.ptr_;
+    return *this;
+  }
+
+  template <typename U>
+  std::enable_if_t<std::is_convertible_v<U *, T *>, RefCountedPtr> &
+  operator=(RefCountedPtr<U> &&other) noexcept {
+    if (ptr_ == other.ptr_) {
+      other.reset();
+      return *this;
+    }
+    reset();
+    ptr_ = other.ptr_;
+    other.ptr_ = nullptr;
+    return *this;
+  }
+
+  void reset() noexcept {
+    if (ptr_)
+      ptr_->DecRef();
+    ptr_ = nullptr;
+  }
+
+  [[nodiscard]] T *release() noexcept {
+    T *p = ptr_;
+    ptr_ = nullptr;
+    return p;
+  }
+
+  constexpr T *operator->() const & noexcept { return ptr_; }
+
+  constexpr T &operator*() const & noexcept { return *ptr_; }
+
+  constexpr T *get() const & noexcept { return ptr_; }
+
+ private:
+  template <typename U>
+  friend class RefCountedPtr;
+  T *ptr_ = nullptr;
 };
 
 }  // namespace dali::c_api
diff --git a/include/dali/dali.h b/include/dali/dali.h
index 0d62e26beb..5efc0695d2 100644
--- a/include/dali/dali.h
+++ b/include/dali/dali.h
@@ -555,6 +555,8 @@ typedef struct _DALIBufferPlacement {
   daliStorageDevice_t device_type;
 
   /** CUDA device ordinal, as returned by CUDA runtime API.
+   *
+   * The value of this field is meaningful only if `device_type` is GPU or `pinned` is `true`.
    *
    * WARNING: The device_id returned by NVML (and thus, nvidia-smi) may be different.
    */
@@ -596,6 +598,10 @@ DALI_API daliResult_t daliTensorListResize(
  * @param num_samples the number of samples in the list
  * @param ndim        the number of dimensions in the sample
  * @param dtype       the element type
+ * @param layout      a layout string describing the order of axes in each sample (e.g. HWC);
+ *                    if NULL, and the TensorList's number of dimensions is equal to `ndim`,
+ *                    then the current layout is kept;
+ *                    if `layout` is an empty string, the tensor list's layout is cleared
  * @param shapes      the concatenated shapes of the samples;
  *                    must contain num_samples*ndim extents
  * @param data        the pointer to the data buffer
@@ -608,6 +614,7 @@ DALI_API daliResult_t daliTensorListAttachBuffer(
     int num_samples,
     int ndim,
     daliDataType_t dtype,
+    const char *layout,
     const int64_t *shapes,
     void *data,
     const ptrdiff_t *sample_offsets,
@@ -629,6 +636,10 @@ DALI_API daliResult_t daliTensorListAttachBuffer(
  * @param dtype   the type of the element of the tensor;
  *                if dtype is DALI_NO_TYPE, then the type is taken from samples[0].dtype;
  *                if set, the dtype in the samples can be left at -1
+ * @param layout  a layout string describing the order of axes in each sample (e.g. HWC);
+ *                if NULL, and the TensorList's number of dimensions is equal to `ndim`,
+ *                then the current layout is kept;
+ *                if `layout` is an empty string, the tensor list's layout is cleared
  * @param samples the descriptors of the tensors to be attached to the TensorList;
  *                the `ndim` and `dtype` of the samples must match and they must match the
  *                values of `ndim` and `dtype` parameters.
@@ -642,6 +653,7 @@ DALI_API daliResult_t daliTensorListAttachSamples(
     int num_samples,
     int ndim,
     daliDataType_t dtype,
+    const char *layout,
    const daliTensorDesc_t *samples,
    const daliDeleter_t *sample_deleters);
 
@@ -699,6 +711,7 @@ DALI_API daliResult_t daliTensorListGetReadyEvent(
  *
  * The function ensures that a readiness event is associated with the tensor list.
  * It can also get the event handle, if the output parameter pointer is not NULL.
+ * The function fails if the tensor list is not associated with a CUDA device.
  */
 DALI_API daliResult_t daliTensorListGetOrCreateReadyEvent(
     daliTensorList_h tensor_list,
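Review note (not part of the patch): the clarified contract of `daliTensorListGetOrCreateReadyEvent` (it fails for tensor lists not associated with a CUDA device, see `GetOrCreateReadyEvent` above) suggests usage like the sketch below. The output-parameter shape of the call and the `DALI_SUCCESS` result code are my assumptions:

    cudaEvent_t ready = nullptr;
    if (daliTensorListGetOrCreateReadyEvent(tl, &ready) == DALI_SUCCESS) {
      // Order subsequent work on `stream` after the tensor list becomes ready.
      cudaStreamWaitEvent(stream, ready, 0);
    }

Note that `GetOrCreateReadyEvent` reuses an event that is already associated with the list, so repeated calls are cheap and return the same handle.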